Each type of factor has its own exploratory data analysis (EDA) notebook. This notebook combines all the types of factors into one type of model to explore their collective impact on stock price behavior. This notebook is broken into the following sections: <br>

* Imports and installs
* Functions
* Download and preprocess data
* Combine data into one master dataframe
* Baseline Random Walk Model
* LSTM Model with past prices
* Random Forest Model with all features
* Random Forest Model with selected features

# Imports and Installs

In [None]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
# Admin
import requests
from google.colab import files
from datetime import datetime, timedelta
from tqdm import tqdm
from tqdm.notebook import tqdm
import time

import ast
import pandas as pd
import numpy as np
import yfinance as yf
from functools import reduce

from sklearn.model_selection import TimeSeriesSplit
from statsmodels.api import OLS, GLS, add_constant
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, make_scorer, mean_squared_error
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from scipy.stats import pearsonr
import scipy.stats as stats

# LSTM
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
import math
from sklearn.model_selection import train_test_split
import kerastuner
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from keras_tuner.tuners import RandomSearch
from kerastuner import HyperModel, RandomSearch
from keras_tuner import HyperParameters, Objective
import keras_tuner

# Sentiment
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import nltk
from nltk.tokenize import sent_tokenize
from scipy.stats import mannwhitneyu

# Random Forest
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Visualizations
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Download the NLTK 'punkt' tokenizer data (sent_tokenize is used by the sentiment cells below).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Functions

In [None]:
def download_stock_prices(ticker_symbol, start, end='2024-03-06'):
  """
  Downloads historical adjusted closing prices for a given stock ticker.

  Parameters:
  - ticker_symbol: The stock ticker symbol as a string.
  - start: First date to request, passed straight to yfinance (e.g. '2020-01-01').
  - end: End of the requested range, passed straight to yfinance. Defaults to
    '2024-03-06', the cutoff date this notebook was originally built around.

  Returns:
  - prices_df: A DataFrame containing the adjusted closing prices.
  """
  # Download historical data
  price_data = yf.download(ticker_symbol, start=start, end=end)

  # Keep only the adjusted close, wrapped in a DataFrame so callers can merge on it
  prices_df = pd.DataFrame(price_data['Adj Close'])

  return prices_df

In [None]:
def interpolate_daily_values(column):
  """
  Spreads each value of a periodic series evenly across the price rows it covers.

  For every entry in `column`, the rows of the module-level `prices` DataFrame from
  the end of the previous entry up to (and including) the position returned by
  `prices.index.searchsorted` for that entry's index label are selected, and the
  entry's value divided by the number of selected rows is assigned to each of them.

  Parameters:
  - column: A pandas Series whose index labels can be located in `prices.index`.

  Returns:
  - daily_df: A DataFrame with columns 'Adj Close' and 'Daily Avg'. The
    'Adj Close' column is kept in the result; callers drop it themselves.

  Notes:
  - Reads the global `prices`, which must be set before calling.
  - NOTE(review): if two consecutive entries resolve to the same position the
    divisor is zero — confirm the inputs cannot do that.
  """
  # Accumulates the per-period slices; its length doubles as the cursor into `prices`
  daily_df = pd.DataFrame(columns = ['Adj Close', 'Daily Avg'])

  for i in range(len(column)):
    # Get the index value of the input column
    a = column.index[i]

    # Find the position in the prices dataframe where this index value would be inserted
    idx_pos = prices.index.searchsorted(a)

    # Exclusive end of the slice for this period
    row_index = idx_pos + 1

    # Divide the period's value by the number of price rows in the period
    average_daily_value = column.iloc[i] / (row_index-len(daily_df))

    # Create a smaller dataframe with data for only period i that will be concatenated to the daily_df
    sub_df = pd.DataFrame(columns=['Adj Close', 'Daily Avg'])
    sub_df['Adj Close'] = prices.iloc[len(daily_df):row_index]
    sub_df['Daily Avg'] = average_daily_value

    daily_df = pd.concat([daily_df, sub_df])

  # The previous `daily_df.drop(columns=['Adj Close'])` discarded its result and was a no-op;
  # it is removed so the code matches what is actually returned.
  return daily_df

In [None]:
def download_and_clean_transcripts():
  """
  Downloads MSFT earnings call transcripts from Financial Modeling Prep and tidies them.

  One request is made per year in 2020-2024. Responses that are not a non-empty
  list are ignored.

  Returns:
  - transcripts_df: A DataFrame sorted by 'Date' (ascending) in which the API's
    'date' column is renamed to 'Date' (as datetime.date values) and 'content'
    to 'Text' (newlines removed). The 'symbol', 'quarter' and 'year' columns are
    dropped. If no request returns data, an empty DataFrame with 'Date' and
    'Text' columns is returned.
  """
  year_list = [2020, 2021, 2022, 2023,2024]
  api_key = ''

  # Collect one frame per year and concatenate once; DataFrame.append no longer exists in pandas 2.x
  yearly_frames = []
  for year in year_list:
    url = f'https://financialmodelingprep.com/api/v4/batch_earning_call_transcript/MSFT?year={year}&apikey={api_key}'
    data = requests.get(url).json()
    if data and isinstance(data, list):
      yearly_frames.append(pd.DataFrame(data))

  # Nothing came back (e.g. missing API key): return an empty, correctly shaped frame
  if not yearly_frames:
    return pd.DataFrame(columns=['Date', 'Text'])

  transcripts_df = pd.concat(yearly_frames, ignore_index=True)
  transcripts_df = transcripts_df.drop(columns=['symbol', 'quarter', 'year'])
  transcripts_df['date'] = pd.to_datetime(transcripts_df['date']).dt.date
  transcripts_df['content'] = transcripts_df['content'].str.replace('\n', '', regex=False)
  transcripts_df = transcripts_df.rename(columns={'date': 'Date', 'content': 'Text'})
  transcripts_df = transcripts_df.sort_values(by='Date', ascending=True)

  return transcripts_df

In [None]:
# Load the ProsusAI/finBERT tokenizer and classifier once, then wrap them in a
# sentiment-analysis pipeline that the scoring helpers below call per sentence.
_FINBERT_CHECKPOINT = 'ProsusAI/finBERT'

tokenizer = AutoTokenizer.from_pretrained(_FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_FINBERT_CHECKPOINT)

finbert_sentiment = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

def get_sentiment(row):
    """
    Scores one row's 'Text' with the finBERT pipeline.

    Input is truncated to 512 tokens. Returns a three-element Series:
    the text itself, the first result's label and the first result's score.
    """
    text = row['Text']
    top_result = finbert_sentiment(text, truncation=True, max_length=512)[0]

    return pd.Series([text, top_result['label'], top_result['score']])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
def calculate_sentiment_scores(transcripts_df):
  """
  Adds an average finBERT sentiment score per transcript.

  For each row of `transcripts_df`, the 'Text' is split into sentences, the first
  two and last two sentences are discarded, every remaining sentence is scored
  with `get_sentiment`, neutral sentences are removed, the 30% of remaining
  sentences with the lowest score are removed, scores labelled 'negative' are
  negated, and the mean of what is left is stored.

  Parameters:
  - transcripts_df: DataFrame with a 'Text' column. It is modified in place.

  Returns:
  - The same DataFrame with a new 'Sentiment' column. Rows that cannot be scored
    (nothing left after trimming or filtering) get NaN, so the column always has
    one value per row.
  """
  ticker_avg_sentiment = []

  for i in tqdm(range(len(transcripts_df))):
    transcript = transcripts_df['Text'].iloc[i]

    # Trim first two and last two sentences
    sentences = sent_tokenize(transcript)[2:-2]

    # Four or fewer sentences leave nothing to score. Record NaN rather than
    # skipping, otherwise the score list ends up shorter than the frame.
    if not sentences:
      ticker_avg_sentiment.append(np.nan)
      continue

    # Score every remaining sentence
    sentences_df = pd.DataFrame(sentences, columns=['Text'])
    sentiment_results = sentences_df.apply(get_sentiment, axis=1)
    sentiment_results.columns = ['Sentence', 'Label', 'Score']

    # Remove neutral scores
    sentiment_results = sentiment_results[sentiment_results['Label'] != 'neutral']

    # Remove the 30% of sentences with the lowest score
    rows_to_drop = int(len(sentiment_results) * 0.3)
    sentiment_results = sentiment_results.sort_values(by = 'Score', ascending = True).iloc[rows_to_drop:]

    # Every sentence was neutral: no average can be formed
    if sentiment_results.empty:
      ticker_avg_sentiment.append(np.nan)
      continue

    # Convert negative sentiment scores
    is_negative = sentiment_results['Label'] == 'negative'
    signed_scores = sentiment_results['Score'].where(~is_negative, -sentiment_results['Score'])

    # Calculate average sentiment score for the call
    ticker_avg_sentiment.append(sum(signed_scores) / len(signed_scores))

  transcripts_df['Sentiment'] = ticker_avg_sentiment

  return transcripts_df

In [None]:
def mean_return_over_n_days(df, N):
  """
  Mean N-day percentage change of the 'Adj Close' column, rounded to 4 decimals.

  Parameters:
  - df: A pandas DataFrame containing the 'Adj Close' column with prices.
  - N: The number of rows (days) over which each return is measured.

  Returns:
  - The mean of the N-day returns, rounded to four decimal places.

  Side effect:
  - Writes the N-day returns into df['Return Over N Days'].
  """
  # Store the N-day percentage change on the caller's frame
  n_day_returns = df['Adj Close'].pct_change(periods=N)
  df['Return Over N Days'] = n_day_returns

  # Average the stored column (NaNs from the first N rows are skipped by mean)
  return round(df['Return Over N Days'].mean(), 4)

In [None]:
def shifted_returns_df(df, N):
  """
  Builds a frame of prices, trailing N-day returns and the same returns shifted N rows earlier.

  Parameters:
  - df: A pandas DataFrame containing an 'Adj Close' column with prices.
  - N: The number of rows (days) used for both the return window and the shift.

  Returns:
  - A new DataFrame with columns 'Adj Close', 'Return Over N Days' and
    'Shifted Return Over N Days'. The input frame is not modified.
  """
  adj_close = df['Adj Close']

  # Percentage change relative to the price N rows earlier
  n_day_return = adj_close.pct_change(periods=N)

  # shift(-N) moves each return N rows up, so a row holds the return that ends N rows later
  return pd.DataFrame({
      'Adj Close': adj_close,
      'Return Over N Days': n_day_return,
      'Shifted Return Over N Days': n_day_return.shift(-N),
  })

In [None]:
class LSTMHyperModel(HyperModel):
  """
  KerasTuner hypermodel for a stacked LSTM regressor.

  Tunable hyperparameters: the number of LSTM layers (1-6), the units of each
  layer (32-512 in steps of 32) and the dropout rate after each layer
  (0.0-0.5 in steps of 0.05). The network ends in a single linear output unit
  and is compiled with the Adam optimizer and mean squared error loss.
  """

  def __init__(self, input_shape):
    # Let the HyperModel base class set up its own attributes before adding ours
    super().__init__()
    self.input_shape = input_shape

  def build(self, hp):
    """Builds and compiles one model from the hyperparameter values in `hp`."""
    model = Sequential()
    num_lstm_layers = hp.Int('num_lstm_layers', min_value=1, max_value=6, step=1)

    for i in range(num_lstm_layers):
      layer_kwargs = {
          'units': hp.Int(f'lstm_units_{i}', min_value=32, max_value=512, step=32),
          # Every layer except the last must emit the full sequence for the next LSTM
          'return_sequences': (i != num_lstm_layers - 1),
      }
      if i == 0:
        # First layer needs to specify input shape
        layer_kwargs['input_shape'] = self.input_shape

      model.add(LSTM(**layer_kwargs))
      model.add(Dropout(rate=hp.Float(f'dropout_{i}', min_value=0.0, max_value=0.5, step=0.05)))

    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
    return model

In [None]:
def lstm_model_with_tuning(df, target_column, feature_columns):
  """
  Tunes and evaluates a one-timestep LSTM with walk-forward cross-validation.

  The data is split with TimeSeriesSplit(n_splits=4). On each fold the features
  and target are min-max scaled using the training part only, a RandomSearch
  over LSTMHyperModel is run (15 trials, 10 epochs each), and the best model's
  RMSE on the fold's test part is recorded in the original target units.

  The model and scalers from the final fold are then used to predict from the
  last two rows of `df`.

  Parameters:
  - df: DataFrame holding the feature and target columns.
  - target_column: Name of the column to predict.
  - feature_columns: A column name or list of column names used as inputs.

  Returns:
  - A one-row DataFrame with 'Average RMSE' across folds, and the two forecasts
    labelled 'Forecast March 5, 2024' (from the second-to-last row) and
    'Forecast March 6, 2024' (from the last row).
  """
  if isinstance(feature_columns, str):
    feature_columns = [feature_columns]

  X = df[feature_columns].values
  y = df[target_column].values.reshape(-1, 1)
  num_features = len(feature_columns)
  # Each sample is fed to the LSTM as a sequence of length one
  input_shape = (1, num_features)

  tscv = TimeSeriesSplit(n_splits=4)
  rmse_scores = []

  for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scalers on the training part only so the test part does not leak into them
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_train_scaled = scaler_X.fit_transform(X_train).reshape(-1, 1, num_features)
    X_test_scaled = scaler_X.transform(X_test).reshape(-1, 1, num_features)
    y_train_scaled = scaler_y.fit_transform(y_train)
    y_test_scaled = scaler_y.transform(y_test)

    model_builder = LSTMHyperModel(input_shape=input_shape)
    # overwrite=True starts a fresh search for every fold. Without it the tuner
    # resumes from the trials already stored under directory/project_name, so
    # later folds (and later notebook runs) would reuse the first search's results.
    # NOTE(review): objective='mse' ranks trials by training MSE even though
    # validation data is supplied — confirm this is intended.
    tuner = RandomSearch(model_builder,
                          objective='mse',
                          max_trials=15,
                          executions_per_trial=1,
                          directory='my_dir',
                          project_name='my_project',
                          overwrite=True)

    tuner.search(X_train_scaled, y_train_scaled, epochs=10, batch_size=64, validation_data=(X_test_scaled, y_test_scaled))
    best_model = tuner.get_best_models(num_models=1)[0]

    # Score the fold in original units
    predictions_scaled = best_model.predict(X_test_scaled)
    predictions = scaler_y.inverse_transform(predictions_scaled)
    rmse = math.sqrt(mean_squared_error(y_test, predictions))
    rmse_scores.append(rmse)

  # Forecasting: uses best_model / scaler_X / scaler_y left over from the final fold
  forecasts = []
  for i in range(1, 3):
    # i=1 is the last row of df, i=2 the row before it
    last_input = df[feature_columns].values[-i].reshape(1, -1)
    last_input_scaled = scaler_X.transform(last_input).reshape(-1, 1, num_features)
    forecast_scaled = best_model.predict(last_input_scaled)
    forecast = scaler_y.inverse_transform(forecast_scaled)
    forecasts.append(forecast[0][0])

  results_df = pd.DataFrame({
      'Average RMSE': [np.mean(rmse_scores)],
      'Forecast March 5, 2024': forecasts[1],
      'Forecast March 6, 2024': forecasts[0]
  })

  return results_df

# Download Data

## Stock Price Data

Data obtained from Yahoo! Finance using yfinance package

In [None]:
# Download MSFT adjusted closing prices starting 2020-01-01 and preview the first rows.
msft_df = download_stock_prices('MSFT', '2020-01-01')
msft_df.head()

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2020-01-02,154.493851
2020-01-03,152.570129
2020-01-06,152.964462
2020-01-07,151.569794
2020-01-08,153.984055


In [None]:
# Line chart of the MSFT adjusted close across the downloaded date range.
fig = (
    px.line(msft_df, y='Adj Close', title='MSFT Adjusted Close Price')
    .update_xaxes(title_text='Date')
    .update_yaxes(title_text='Adjusted Close Price')
    .update_traces(line=dict(color="#00274C"))
)

fig.show()

## Implied Volatility

Implied volatility is a measure used to estimate the degree of future price variability of a stock, based on the prices of its options. It reflects investors' predictions about the asset's potential movement but doesn't indicate the direction of the movement.<br>Data obtained from Nasdaq Data Link (formerly Quandl) using an API key.

In [None]:
# Load the raw implied-volatility export, index it by a datetime 'Date',
# drop the ticker column and put the rows in chronological order.
implied_volatility = (
    pd.read_csv('msft_implied_volatility_raw.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .rename(columns={'date': 'Date'})
    .set_index('Date')
    .drop(columns='ticker')
    .sort_values(by='Date', ascending=True)
)

In [None]:
# Keep only dates present in both tables, then plot price against 30-day implied volatility
# with a LOWESS trendline.
merged_df = msft_df.merge(implied_volatility, on='Date', how='inner')

fig = (
    px.scatter(
        merged_df,
        x='ivmean30',
        y='Adj Close',
        title='30 Day Implied Volatility vs MSFT Adjusted Close Price',
        trendline="lowess",
    )
    .update_xaxes(title_text='IV Mean 30')
    .update_yaxes(title_text='MSFT Adjusted Close Price')
)

fig.show()

## CBOE Volatility

The CBOE Volatility Index (VIX), often called the 'fear gauge', is a popular measure that represents the stock market's expectation of volatility over the next 30 days, based on options prices of the S&P 500 index. While the VIX measures the market's overall volatility expectations, implied volatility refers to the expected volatility of a specific security or asset, in this case MSFT.

In [None]:
# Set starting and ending dates
start_date = '2020-01-01'
end_date = '2024-03-05'

# List of ticker symbols
vix_tickers = ['^VIX9D', '^VIX', '^VIX3M', '^VIX6M']

# Create an empty DataFrame with a full (calendar-day) date index
vix_df = pd.DataFrame(index=pd.date_range(start_date, end_date))

# Loop through each ticker symbol
for vix_ticker in vix_tickers:
    try:
        # Download the data for the current ticker
        price = yf.download(vix_ticker, start=start_date, end=end_date)['Adj Close']

        # Align to the full date index, carrying the last known value forward.
        # .ffill() replaces the deprecated fillna(method='ffill').
        vix_df[vix_ticker] = price.reindex(vix_df.index, method='ffill').ffill()
    except Exception as e:
        # Best effort: a ticker that fails to download is reported and left out
        print(f"Error downloading data for {vix_ticker}: {e}")


vix_df = vix_df.round(4).rename_axis('Date')

# Keep only the dates that also exist in the MSFT price index
common_index = msft_df.index.intersection(vix_df.index)

vix_df = vix_df.loc[common_index]

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


## Stock Grade

The stock rating provided by hedge funds, investment firms, and analysts. Data sourced from Financial Modeling Prep using an API Key

In [None]:
# Load the raw analyst grade history and drop the columns that are not used as features.
grade_df = pd.read_csv('msft_grade_raw.csv')
grade_df = grade_df.drop(columns = ['symbol', 'gradingCompany'])

# Grade labels grouped by numeric tier: 1 = bearish ... 4 = most bullish.
_GRADE_TIERS = {
    1: ('Sell', 'Underperform', 'Underweight'),
    2: ('Hold', 'Neutral', 'Market perform', 'Perform', 'Equal-Weight', 'Sector Perform', 'Equal-weight'),
    3: ('Overweight', 'Buy', 'Outperform'),
    4: ('Strong Buy', 'Long-term buy', 'Long-Term Buy'),
}

# Flat lookup from grade label to tier
grade_to_category = {
    label: tier
    for tier, labels in _GRADE_TIERS.items()
    for label in labels
}


def map_grade_to_category(grade):
  """Returns the numeric tier for a grade label, or None for an unknown label."""
  return grade_to_category.get(grade)


def calculate_grade_change(row):
  """
  Returns the tier difference between a row's 'newGrade' and 'previousGrade'.

  When 'previousGrade' is missing or empty, the tier of 'newGrade' itself is
  returned. When either label is not in the lookup, None is returned.
  """
  previous_grade = row['previousGrade']
  new_cat = map_grade_to_category(row['newGrade'])

  # No earlier grade on record: report the new tier as-is
  if pd.isnull(previous_grade) or previous_grade == '':
    return new_cat

  prev_cat = map_grade_to_category(previous_grade)
  if prev_cat is None or new_cat is None:
    return None

  return new_cat - prev_cat

# Tier change per rating event (row-wise)
grade_df['Grade Change'] = grade_df.apply(calculate_grade_change, axis=1)

def map_new_grade_to_category(new_grade):
  """Returns the numeric tier for a 'newGrade' label, or None when the label is unknown."""
  return grade_to_category.get(new_grade, None)

# Tier of the new grade on its own
grade_df['Grade'] = grade_df['newGrade'].apply(map_new_grade_to_category)

# Average both measures per calendar date, keep 2020 onward, index by date and
# drop dates where either measure is missing.
grade_df = (
    grade_df
    .rename(columns={'date': 'Date'})
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .groupby('Date')[['Grade Change', 'Grade']]
    .mean()
    .reset_index()
    .loc[lambda frame: frame['Date'] > pd.Timestamp('2019-12-31')]
    .set_index('Date')
    .dropna()
)



In [None]:
# Expand the sparse grade table to one row per trading day, carrying the most recent grade forward.
expanded_grade_df = grade_df.reindex(msft_df.index, method='ffill')
# Left-join onto the price index, then drop the price column so only the grade features remain.
merged_df = msft_df.merge(expanded_grade_df, left_index=True, right_index=True, how='left')
grade_df = merged_df.drop('Adj Close', axis = 1)
# Copy row 4 into rows 0-3.
# NOTE(review): presumably the first grade falls on the fifth trading day, leaving the
# first four rows empty after the forward fill — confirm against the data.
grade_df.iloc[:4] = grade_df.iloc[4]

## Analyst Recommendation

This is an analyst's opinion on the future performance of MSFT stock, expressed as strong sell, sell, hold, buy, or strong buy, based on analysis of Microsoft's fundamentals, industry position, and market conditions.

In [None]:
df = pd.read_csv('msft_analyst_recommendations_raw.csv')

# Human-readable summary of the rating counts on each row
df['analystRecommendations'] = (
    "Buy: " + df['analystRatingsbuy'].astype(str)
    + ", Hold: " + df['analystRatingsHold'].astype(str)
    + ", Sell: " + df['analystRatingsSell'].astype(str)
    + ", Strong Sell: " + df['analystRatingsStrongSell'].astype(str)
    + ", Strong Buy: " + df['analystRatingsStrongBuy'].astype(str)
)

# Score contribution of one analyst in each rating bucket
weights = {
    'analystRatingsStrongSell': -2,
    'analystRatingsSell': -1,
    'analystRatingsHold': 0,
    'analystRatingsbuy': 1,
    'analystRatingsStrongBuy': 2
}

# One '<bucket>_weighted' column per bucket: count multiplied by the bucket's weight
weighted_columns = {
    f'{rating_column}_weighted': df[rating_column] * rating_weight
    for rating_column, rating_weight in weights.items()
}
df = df.assign(**weighted_columns)

# Sum the weighted scores into a single score
df['weightedScore'] = df[list(weighted_columns)].sum(axis=1)



In [None]:
# Reduce the analyst table to a date-indexed weighted score covering 2020 onward.
recommendation_score = (
    df[['date', 'weightedScore']]  # Select relevant columns
    .rename(columns={'date': 'Date'})  # Rename 'date' to 'Date'
    .assign(Date=lambda x: pd.to_datetime(x['Date']))  # Convert 'Date' to datetime
    .query("Date > '2019-12-31'")  # Filter dates after 2019-12-31
    .sort_values('Date', ascending=True)  # Sort by 'Date'
    .set_index('Date')
    .dropna()
)

# Expand to one row per trading day, carrying the most recent score forward.
expanded_rec_score = recommendation_score.reindex(msft_df.index, method='ffill')
# Left-join onto the price index, then drop the price column so only the score remains.
merged_df = msft_df.merge(expanded_rec_score, left_index=True, right_index=True, how='left')
recommendation_score = merged_df.drop('Adj Close', axis = 1)

## Trading Volume

The total number of shares traded each day, indicating the overall activity and liquidity of MSFT in the market. <br>Data sourced from Yahoo! Finance.

In [None]:
# Download the full MSFT price table again and keep whatever remains once the
# price columns are removed (i.e. the trading volume).
ticker_symbol = 'MSFT'
start_date = '2020-01-01'
end_date = '2024-03-06'

volume = (
    yf.download(ticker_symbol, start=start_date, end=end_date)
    .drop(columns=['Open', 'High', 'Low', 'Close', 'Adj Close'])
)

[*********************100%%**********************]  1 of 1 completed


## Income Statements (Quarterly)

Income statements for each quarter. The following code resamples this to daily amounts

In [None]:
# Load the raw quarterly income statements.
quarterly_income_statements = pd.read_csv('msft_q_income_statements_raw.csv')
# Drop the share-count, ratio and filing-date columns, index by a datetime 'Date'
# in chronological order, and discard rows with missing values.
daily_is = (quarterly_income_statements
            .drop(columns=['weightedAverageShsOut', 'weightedAverageShsOutDil',
                           'grossProfitRatio', 'ebitdaratio', 'operatingIncomeRatio',
                           'incomeBeforeTaxRatio', 'netIncomeRatio','fillingDate'])
            .rename(columns={'date': 'Date'})
            .assign(Date=lambda x: pd.to_datetime(x['Date']))
            .sort_values(by='Date', ascending=True)
            .set_index('Date')
            .dropna())

In [None]:
# interpolate_daily_values reads the global `prices`, so point it at the MSFT prices first.
prices = msft_df
ful_daily_is = pd.DataFrame(index=daily_is.index)

for column_name in daily_is.columns:
  # Spread the quarterly line item over trading days, discard the helper price
  # column if present, and label the result with the line item's own name.
  interpolated_df = (
      interpolate_daily_values(daily_is[column_name])
      .drop(columns=['Adj Close'], errors='ignore')
      .rename(columns={'Daily Avg': column_name})
  )

  # Append this line item as a new column
  ful_daily_is = pd.concat([ful_daily_is, interpolated_df], axis=1)

# Keep only the rows where every line item has a value
daily_is = ful_daily_is.dropna()

In [None]:
# Align the daily income statement values to the union of their own dates and the
# MSFT trading dates, forward-filling onto dates that have no value of their own.
combined_index = daily_is.index.union(msft_df.index)
daily_is_reindexed = daily_is.reindex(combined_index, method='ffill')

The income statement data ends in December 2023. We extrapolate values for January–March 2024 using the average growth rate from the fourth quarter of 2023, taken from the income statement growth data provided by Financial Modeling Prep.

In [None]:
# Request MSFT's quarterly income-statement growth figures from Financial Modeling Prep.
# An API key must be filled in for the request to be authorised.
api_key = ''
url = f'https://financialmodelingprep.com/api/v3/income-statement-growth/MSFT?period=quarter&apikey={api_key}'
data = requests.get(url).json()


In [None]:
# Load the saved copy of the growth figures; this replaces whatever `data` held before.
data = pd.read_csv('msft_is_growth.csv')
# read_csv already returns a DataFrame, so this wrap does not change the contents.
data = pd.DataFrame(data)

In [None]:
# Remove the identifier columns and the growth figures for ratios and share counts,
# leaving only growth rates of income-statement line items.
data = data.drop(columns= ['date', 'symbol', 'calendarYear', 'period',
                  'growthGrossProfitRatio', 'growthEBITDARatio',
                  'growthOperatingIncomeRatio', 'growthIncomeBeforeTaxRatio',
                  'growthNetIncomeRatio', 'growthWeightedAverageShsOut',
                  'growthWeightedAverageShsOutDil'])


In [None]:
# Some columns in the income statement don't have a corresponding column in the income statement growth table.
# For this we will use the average growth rate across all columns for that quarter.
average = data.iloc[0].mean()
average

0.10087824294736843

In [None]:
# Maps each growth-table column name to the income-statement column it describes.
growth_to_column_mapping = {
    'growthRevenue': 'revenue',
    'growthCostOfRevenue': 'costOfRevenue',
    'growthGrossProfit': 'grossProfit',
    'growthResearchAndDevelopmentExpenses': 'researchAndDevelopmentExpenses',
    'growthGeneralAndAdministrativeExpenses': 'generalAndAdministrativeExpenses',
    'growthSellingAndMarketingExpenses': 'sellingAndMarketingExpenses',
    'growthOtherExpenses': 'otherExpenses',
    'growthOperatingExpenses': 'operatingExpenses',
    'growthCostAndExpenses': 'costAndExpenses',
    'growthInterestExpense': 'interestExpense',
    'growthDepreciationAndAmortization': 'depreciationAndAmortization',
    'growthEBITDA': 'ebitda',
    'growthOperatingIncome': 'operatingIncome',
    'growthTotalOtherIncomeExpensesNet': 'totalOtherIncomeExpensesNet',
    'growthIncomeBeforeTax': 'incomeBeforeTax',
    'growthIncomeTaxExpense': 'incomeTaxExpense',
    'growthNetIncome': 'netIncome',
    'growthEPS': 'eps',
    'growthEPSDiluted': 'epsdiluted'
}

# Empty frame shaped like daily_is; each mapped column is filled with a single growth rate.
growth_rates_df = pd.DataFrame(index=daily_is.index, columns=daily_is.columns)

# Populate growth_rates_df with growth rates using the mapping
for growth_key, column_name in growth_to_column_mapping.items():
  if column_name in growth_rates_df.columns:
    # Assign the value from data[0] to the correct column in growth_rates_df
    growth_rates_df[column_name] = data.iloc[0].get(growth_key, None)
# Every row holds the same rates, so keep just the last row; the name now refers to a Series.
growth_rates_df = growth_rates_df.iloc[-1]
# Line items without a growth figure fall back to the average rate computed earlier.
growth_rates_df.fillna(average, inplace=True)

In [None]:
# Load the saved daily income statement table, indexed by its parsed 'Date' column.
daily_is_complete = pd.read_csv('msft_daily_income_statements.csv', parse_dates=['Date']).set_index('Date')

Income statement line items like those relating to shares and ratios can't be interpolated using the method above, but should be forward filled.

In [None]:
# Pull the share-count and ratio columns out of the quarterly statements, indexed by 'Date'.
shares_ratios = pd.DataFrame(data = quarterly_income_statements[['weightedAverageShsOut', 'weightedAverageShsOutDil',
                           'grossProfitRatio', 'ebitdaratio', 'operatingIncomeRatio',
                           'incomeBeforeTaxRatio', 'netIncomeRatio','date']] ).dropna().rename(columns={'date': 'Date'}).set_index('Date')

# Row-to-row percentage change, in the order the rows appear in the file.
# NOTE(review): the rows are not sorted by date here — confirm the file is in
# chronological order, otherwise these growth rates run backwards in time.
growth_rates = shares_ratios.pct_change()

# Calculate the average of these growth rates for each column
average_growth_rates = growth_rates.mean()

# Convert the index to datetimes, align to the trading days and carry each quarter's values forward.
shares_ratios.index = pd.to_datetime(shares_ratios.index)
shares_ratios = shares_ratios.reindex(msft_df.index)
shares_ratios = shares_ratios.ffill()

In [None]:
# Since the data starts on March, 2020, the previous values need to be backfilled.
# Calculate the adjusted values for backfilling using the values at index 61
# NOTE(review): 61 is hard-coded — presumably the row of the first quarterly value; confirm.
values_at_62nd_row = shares_ratios.iloc[61]
# NOTE(review): dividing by (1 - growth) only approximates undoing one period of growth;
# the exact inverse would divide by (1 + growth) — confirm which is intended.
adjusted_values = values_at_62nd_row / (1 - average_growth_rates)

# Create a DataFrame with adjusted values repeated for the first 61 rows
adjusted_df = pd.DataFrame([adjusted_values] * 61, index=shares_ratios.index[:61])

# Write the adjusted values into the matching rows of shares_ratios
shares_ratios.update(adjusted_df)

shares_ratios.ffill(inplace=True)

## Balance Sheet

Provides a snapshot of a company's financial position at a specific point in time. Data obtained from Financial Modeling Prep.

In [None]:
# Load the quarterly balance sheets.
qbs = pd.read_csv('msft_q_balance_sheet.csv')

In [None]:
# Index the balance sheets by a datetime 'Date', align them to the trading days and
# fill the gaps between quarterly values by linear interpolation.
dbs = (qbs.assign(Date=pd.to_datetime(qbs['Date']))
         .set_index('Date')
         .reindex(msft_df.index)
         .interpolate(method='linear'))

In [None]:
# Number of leading rows to rebuild, including both the start and the end point.
# NOTE(review): 62 is hard-coded — presumably the first quarterly value sits at row 61; confirm.
num_points = 62

# Straight-line endpoints: the first row of the raw quarterly table and the
# last row of the leading window in the daily table.
starting_values = qbs.iloc[0]
ending_values = dbs.iloc[num_points - 1]

# Generate interpolated values for the leading rows, one column at a time
for column in dbs.columns:
  # Calculate slope (m)
  slope = (ending_values[column] - starting_values[column]) / (num_points - 1)

  # Calculate y-intercept (b), using y = mx + b, where x = 0 for the start
  intercept = starting_values[column]

  # Generate interpolated values for each point
  interpolated_values = [intercept + slope * x for x in range(num_points)]

  # Assign through one .iloc call on the frame itself. The chained form
  # dbs[column].iloc[...] = ... may write to a temporary copy and leave dbs unchanged.
  dbs.iloc[:num_points, dbs.columns.get_loc(column)] = interpolated_values

# Forward-fill any gaps that remain after the rebuilt window
dbs.iloc[num_points:] = dbs.iloc[num_points:].ffill()

## Cash Flow Statement

A summary of a company's cash inflows and outflows over a period of time.

In [None]:
# Load the quarterly cash flow statements, index them by a datetime 'Date'
# and drop the first row.
qcf = (
    pd.read_csv('msft_q_cash_flow.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
    .iloc[1:]
)

In [None]:
# interpolate_daily_values reads the global `prices`, so point it at the MSFT prices first.
prices = msft_df
ful_daily_cf = pd.DataFrame(index=prices.index)

for column_name in qcf.columns:
  # Spread the quarterly line item over trading days, discard the helper price
  # column if present, and label the result with the line item's own name.
  interpolated_df = (
      interpolate_daily_values(qcf[column_name])
      .drop(columns=['Adj Close'], errors='ignore')
      .rename(columns={'Daily Avg': column_name})
  )

  # Append this line item as a new column
  ful_daily_cf = pd.concat([ful_daily_cf, interpolated_df], axis=1)

# Keep only the rows where every line item has a value
dcf = ful_daily_cf.dropna()

In [None]:
# Load the saved daily cash flow table, indexed by its parsed 'Date' column.
daily_cf = pd.read_csv('msft_daily_cash_flow.csv', parse_dates=['Date']).set_index('Date')

## Earnings Surprise

The difference between actual earnings per share and earnings expected by analysts prior to earnings release. Data obtained from Financial Modeling Prep.

In [None]:
# Load the earnings surprises, index them by a datetime 'Date' and align them to
# the trading days; days without a surprise are set to 0.
surprise = (
    pd.read_csv('msft_earnings_surprise.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
    .reindex(msft_df.index)
    .fillna(0)
)

## Earnings Call Transcripts

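Transcripts from quarterly earnings calls, which `download_and_clean_transcripts` retrieves from Financial Modeling Prep. The first cell below separately downloads MSFT news-sentiment articles from AlphaVantage.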

In [None]:
# Download MSFT news-sentiment articles from AlphaVantage and save them to a csv file
relevant_data = []

# Define start and end dates as datetime objects
start_date = datetime(2022, 3, 1)
end_date = datetime(2024, 3, 6)

# Define API key
api_key = ''

def _is_highly_relevant_to_msft(item, threshold=0.7):
  """True when any ticker_sentiment entry of `item` is for MSFT with relevance above `threshold`."""
  return any(
      sentiment_item['ticker'] == 'MSFT' and float(sentiment_item['relevance_score']) > threshold
      for sentiment_item in item.get('ticker_sentiment', [])
      if 'ticker' in sentiment_item and 'relevance_score' in sentiment_item
  )

# Walk the date range one day at a time with a separate cursor so start_date keeps its value
current_day = start_date
while current_day <= end_date:
  next_day = current_day + timedelta(days=1)

  # Convert the one-day window to the string format the API expects
  time_from = current_day.strftime('%Y%m%d')
  time_to = next_day.strftime('%Y%m%d')

  url = f'https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers=MSFT&time_from={time_from}T0000&time_to={time_to}T0000&apikey={api_key}'

  r = requests.get(url)

  if r.status_code == 200:
    # Parse JSON response
    data = r.json()

    # A 200 response is not guaranteed to contain a 'feed' key (e.g. when the API
    # returns a notice instead of results), so treat a missing feed as empty.
    for item in data.get('feed', []):
      if _is_highly_relevant_to_msft(item):
        relevant_data.append(item)

  current_day = next_day

df = pd.DataFrame(relevant_data)

# Save DataFrame to a CSV file
csv_filename = 'relevant_data.csv'
df.to_csv(csv_filename, index=False)
files.download(csv_filename)

In [None]:
# Read the uploaded CSV file into a DataFrame
relevant_data_all_columns = pd.read_csv('relevant_data.csv')
relevant_data_all_columns['ticker_sentiment'] = relevant_data_all_columns['ticker_sentiment'].apply(ast.literal_eval)

In [None]:
# Score every transcript with calculate_sentiment_scores (defined in the Functions
# section) instead of repeating its loop here; the repeated copy had a mis-indented
# line that stopped the cell from running.
# NOTE(review): msft_transcripts is not created in any cell shown above — presumably
# it is the output of download_and_clean_transcripts(); confirm.
msft_transcripts = calculate_sentiment_scores(msft_transcripts)

# Keep the per-call averages available under their original name
msft_avg_sentiment = msft_transcripts['Sentiment'].tolist()

In [None]:
# Save DataFrame to a CSV file
csv_filename = 'msft_transcripts_with_sentiment.csv'
msft_transcripts.to_csv(csv_filename, index=False)

# Download the CSV file to your local machine
files.download(csv_filename)

# Upload a CSV file
uploaded = files.upload()

# Check the uploaded files
for filename in uploaded.keys():
    print(f'Uploaded file: {filename}')

In [None]:
# Load the per-call sentiment scores and align them to the trading calendar.
# 'Date' is read from CSV as text, so it is parsed to datetime before reindexing;
# otherwise the labels cannot match a datetime index and every row is filled with 0.
# NOTE(review): assumes msft_df.index is a DatetimeIndex, as the other feature cells do — confirm.
transcripts = pd.read_csv('msft_transcripts_with_sentiment.csv')
transcripts = (transcripts[['Date', 'Sentiment']]
               .drop_duplicates()
               .assign(Date=lambda df: pd.to_datetime(df['Date']))
               .set_index('Date')
               .reindex(msft_df.index)
               .fillna(0))

In [None]:
transcripts.tail()

Unnamed: 0_level_0,Sentiment
Date,Unnamed: 1_level_1
2024-02-28,0.0
2024-02-29,0.0
2024-03-01,0.0
2024-03-04,0.0
2024-03-05,0.0


## Congress Trades

In [None]:
# Net congressional trade per transaction date:
# purchases count as +Amount, every other transaction type as -Amount.
congress = (
    pd.read_csv('msft_congress.csv')
      .drop(columns=['Range', 'House', 'Party', 'last_modified', 'Ticker', 'ReportDate', 'Unnamed: 0', 'Representative'])
      .replace({'Transaction': {'Sale (Partial)': 'Sale', 'Sale (Full)': 'Sale'}})
      .rename(columns={'TransactionDate': 'Date'})
      .assign(
          Date=lambda df: pd.to_datetime(df['Date']),
          congress_net_trade=lambda df: np.where(df['Transaction'] == 'Purchase',
                                                 df['Amount'],
                                                 -df['Amount']),
      )
      # Double brackets keep the result a one-column DataFrame
      .groupby('Date')[['congress_net_trade']]
      .sum()
)

In [None]:
congress = congress.reindex(msft_df.index).fillna(0)

Is there an increase in the price of MSFT in the days following a purchase transaction?

In [None]:
# Build the table of congressional MSFT purchases used for the post-purchase return study.
msft_congress = pd.read_csv('msft_congress.csv')
msft_congress_df = pd.DataFrame(msft_congress)

# MSFT Purchase Transactions
msft_purchase_transaction = msft_congress_df[msft_congress_df['Transaction'] == 'Purchase'].copy()
msft_purchase_transaction = msft_purchase_transaction.drop(columns = ['Representative','Transaction', 'Unnamed: 0'])
# NOTE: TransactionDate has not been parsed yet at this point (conversion happens below),
# so this first sort orders the raw CSV values.
msft_purchase_transaction = msft_purchase_transaction.sort_values(by='TransactionDate', ascending=True)
msft_purchase_transaction['Amount'] = msft_purchase_transaction['Amount'].astype(float)
# nsmallest(2) returns the two smallest rows; .unique() collapses them to one value when
# both rows share the same amount, so either one or two distinct amounts are excluded.
two_lowest_values = msft_purchase_transaction['Amount'].nsmallest(2).unique()
msft_purchase_transaction = msft_purchase_transaction[~msft_purchase_transaction['Amount'].isin(two_lowest_values)]
msft_purchase_transaction['TransactionDate'] = pd.to_datetime(msft_purchase_transaction['TransactionDate'])
# Final chronological sort on the parsed dates.
msft_purchase_transaction = msft_purchase_transaction.sort_values(by = 'TransactionDate')
msft_purchase_transaction.head()

Unnamed: 0,ReportDate,TransactionDate,Ticker,Range,House,Amount,Party,last_modified
780,8/29/2014,2013-07-19,MSFT,"$15,001 - $50,000",Representatives,15001.0,R,11/16/2023
779,8/29/2014,2013-09-15,MSFT,"$15,001 - $50,000",Representatives,15001.0,R,11/16/2023
761,8/12/2014,2014-07-08,MSFT,"$15,001 - $50,000",Representatives,15001.0,R,11/16/2023
742,7/28/2015,2015-06-23,MSFT,"$15,001 - $50,000",Representatives,15001.0,D,11/16/2023
741,7/28/2015,2015-06-26,MSFT,"$50,001 - $100,000",Representatives,50001.0,D,11/16/2023


In [None]:
msft_mean_return = []
msft_post_transaction_return = []

# Horizons run from 1 to 250 trading days (250 trading days in one year) so that entry k
# of each list lines up with 'Number of Trading Days' == k in the comparison table
# built from range(1, 251).
for n_days in range(1, 251):

  # mean return for N number of days
  msft_mean_return.append(mean_return_over_n_days(msft_df, n_days))

  # return for N number of days following a congress purchase;
  # each purchase is matched to the first trading day on or after its transaction date
  n_day_shifted_return = shifted_returns_df(msft_df, n_days)
  merged_df = pd.merge_asof(msft_purchase_transaction, n_day_shifted_return.reset_index(), left_on='TransactionDate', right_on='Date', direction='forward')
  msft_post_transaction_return.append(round(np.mean(merged_df['Shifted Return Over N Days']),4))

In [None]:
msft_comparison = pd.DataFrame({
    'Number of Trading Days': range(1, 251),
    'mean_return': msft_mean_return,
    'post_transaction_return': msft_post_transaction_return,
})
msft_comparison['average_excess_return'] = msft_comparison['post_transaction_return'] - msft_comparison['mean_return']

In [None]:
fig = px.line(msft_comparison, x='Number of Trading Days', y='average_excess_return', color_discrete_sequence=['#00274C'],
              title='Average Excess Return Over Number of Trading Days')

fig.update_xaxes(title_text='Number of Trading Days Post Purchase')
fig.update_yaxes(title_text='Average Excess Return')
fig.add_hline(y=0, line_color='#FFCB05')

fig.show()

The chart above shows the difference between the return on MSFT stock over the days after a Congress member makes a purchase and the return over a "normal" period of the same length. Around the 63-day post-trade mark, this excess return starts to come into effect. This can potentially be seen as a leading indicator: on average, once a Congress member has purchased MSFT stock, we see higher-than-average returns over the 2-month to 1-year period post-purchase.

In [None]:
msft_df = msft_df.drop(columns = 'Return Over N Days')

## Ratings

Analyst rating of MSFT. Data obtained from Financial Modeling Prep

In [None]:
ratings = pd.read_csv('msft_ratings.csv').assign(Date=lambda df: pd.to_datetime(df['Date'])).set_index('Date')
ratings = ratings.drop(columns = ['ratingRecommendation', 'ratingDetailsDCFRecommendation', 'ratingDetailsROERecommendation', 'ratingDetailsROARecommendation', 'ratingDetailsDERecommendation', 'ratingDetailsPERecommendation', 'ratingDetailsPBRecommendation'])

In [None]:
# Align ratings to the trading calendar and carry the most recent rating forward.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
ratings = ratings.reindex(msft_df.index).ffill()

## Social Media Sentiment

counts from Nov 18, 2021 to Jan 21, 2024 <br> Sentiment scores from Feb 20, 2022 to Jan 21, 2024 for StockTwits and Apr 4, 2023 for Twitter

Data obtained from Financial Modeling Prep

In [None]:
def fetch_social_media_data(url, api_key, start_date='2020-01-01'):
  """
  Download paginated social-sentiment records and keep rows dated on or after `start_date`.

  Pages are requested as `<url>&page=0`, `<url>&page=1`, ... Paging stops when a page
  comes back empty or when the last row of a page is older than `start_date`.
  (This stop rule presumes the API returns rows newest-first — TODO confirm.)
  The page that crosses the cutoff is kept, because its earlier rows can still be
  in range; the final date filter removes the rows that are too old.

  Parameters:
  - url: Endpoint URL containing a literal `{api_key}` placeholder.
  - api_key: Value substituted for the `{api_key}` placeholder.
  - start_date: Earliest date to keep (anything accepted by pd.Timestamp).

  Returns:
  - DataFrame of the collected rows with `date` parsed to datetime and filtered to
    `date >= start_date`, or an empty DataFrame when nothing was returned.

  Raises:
  - requests.HTTPError: if a page request returns an HTTP error status.
  """
  cutoff = pd.Timestamp(start_date)
  all_data = []
  page = 0

  while True:
    paginated_url = f"{url}&page={page}"
    response = requests.get(paginated_url.format(api_key=api_key), timeout=30)
    response.raise_for_status()

    # Convert the current page of data to a DataFrame
    df = pd.DataFrame(response.json())
    if df.empty:
      break

    all_data.append(df)

    # Stop once the last row of this page is older than the cutoff
    last_date = pd.to_datetime(df['date'].iloc[-1])
    if last_date < cutoff:
      break

    page += 1

  if not all_data:
    return pd.DataFrame()

  # Concatenate all the pages of data into a single DataFrame and filter by date
  social_sentiment = pd.concat(all_data, ignore_index=True)
  social_sentiment['date'] = pd.to_datetime(social_sentiment['date'])
  return social_sentiment[social_sentiment['date'] >= cutoff]

In [None]:
api_key = ''
base_url = 'https://financialmodelingprep.com/api/v4/historical/social-sentiment?symbol=MSFT&apikey={api_key}'
social_sentiment = fetch_social_media_data(base_url, api_key)

In [None]:
social_sentiment = pd.read_csv('msft_social_sentiment.csv')
social_sentiment = pd.DataFrame(social_sentiment)
social_sentiment['Date'] = pd.to_datetime(social_sentiment['Date'])
social_sentiment['Date'] = social_sentiment['Date'].dt.strftime('%Y-%m-%d')

In [None]:
columns_to_convert = ['stocktwitsSentiment', 'twitterSentiment']
for col in columns_to_convert:
  social_sentiment[col] = pd.to_numeric(social_sentiment[col], errors='coerce')
  social_sentiment[col] = social_sentiment[col].fillna(0)

aggregations = {
    'stocktwitsPosts': 'sum', 'twitterPosts': 'sum',
    'stocktwitsComments': 'sum', 'twitterComments': 'sum',
    'stocktwitsLikes': 'sum', 'twitterLikes': 'sum',
    'stocktwitsImpressions': 'sum', 'twitterImpressions': 'sum',
    'stocktwitsSentiment': 'mean', 'twitterSentiment': 'mean'
}

# Group by 'Date', then apply the aggregation operations
social_sentiment = social_sentiment.groupby('Date').agg(aggregations).reset_index()

In [None]:
social_sentiment = social_sentiment.set_index('Date')
social_sentiment.index = pd.to_datetime(social_sentiment.index)
social_sentiment = social_sentiment.reindex(msft_df.index).fillna(value = 0)

## Reddit

Reddit data available from March 9, 2022. <br>

Data was downloaded from the subreddits r/wallstreetbets and r/stocks for the queries "MSFT" and "Microsoft" using the following 4 cells repeated 4 times.

In [None]:
''' The code for the 'Reddit' section in this cell and the following 2 cells were taken from the following source:
Lang, Erik. "Reddit API Lab - Create." SIADS682 Social Media Analytics, submitted 12 March, 2024, University of Michigan. Unpublished course assignment.'''
REDDIT_USERNAME = hidden_credentials.reddit_keys()['REDDIT_USERNAME']
REDDIT_PASSWORD = hidden_credentials.reddit_keys()['REDDIT_PASSWORD']
APP_ID = hidden_credentials.reddit_keys()['APP_ID']
APP_SECRET = hidden_credentials.reddit_keys()['APP_SECRET']
APP_NAME = hidden_credentials.reddit_keys()['APP_NAME']

In [None]:
reddit = praw.Reddit(
    client_id=APP_ID,
    client_secret=APP_SECRET,
    user_agent=APP_NAME,
    username=REDDIT_USERNAME,
    password=REDDIT_PASSWORD,
    check_for_async=False # This additional parameter supresses a warning about "Asynchronous PRAW"
)

In [None]:
subreddit = reddit.subreddit("wallstreetbets")
print("r/"+ subreddit.display_name)
print("-------")
print("title: "+ subreddit.title)
print("-------")
print("id:" + subreddit.id)
print("-------")
print("number of subscribers:" + str(subreddit.subscribers))
print("-------")
# print(subreddit.description)

In [None]:
# Download data from 'MSFT' query from the wallstreetbets subreddit.
# A dedicated name is used for the download buffer: `msft_df` holds the MSFT price
# history that other cells reindex against, so it must not be rebound here.
reddit_posts_df = pd.DataFrame(columns=["title", "text", "id", "created_utc", "num_comments", "score"])
start_date = datetime(2022, 3, 6)
end_date = datetime(2024, 3, 6)
reddit_posts_df = fetch_and_append_reddit_data(start_date, end_date, reddit, reddit_posts_df, 'wallstreetbets', 'MSFT')
# Save DataFrame to a CSV file
csv_filename = 'reddit_wallstreetbets_msft_data.csv'
reddit_posts_df.to_csv(csv_filename)

# Download the CSV file
files.download(csv_filename)

In [None]:
# Load the four saved Reddit pulls (two subreddits x two search terms).
# pd.read_csv already returns a DataFrame, so no further wrapping is needed.
wallstreetbets_msft = pd.read_csv('reddit_wallstreetbets_msft_data.csv')
wallstreetbets_microsoft = pd.read_csv('reddit_wallstreetbets_microsoft_data.csv')
stocks_msft = pd.read_csv('reddit_stocks_msft_data.csv')
stocks_microsoft = pd.read_csv('reddit_stocks_microsoft_data.csv')

In [None]:
# Stack the shared columns of all four pulls into one frame.
# NOTE: this rebinds `reddit`, which earlier held the praw client.
reddit_columns = ['title', 'text', 'created_utc']
reddit = pd.concat(
    [frame[reddit_columns]
     for frame in (wallstreetbets_msft, wallstreetbets_microsoft, stocks_msft, stocks_microsoft)],
    ignore_index=True,
)

In [None]:
# Filter out comments
reddit =reddit[~reddit['title'].str.startswith('Comment on')].copy()
reddit['created_utc'] = pd.to_datetime(reddit['created_utc']).dt.date
reddit = reddit.rename(columns={'created_utc': 'Date'})
reddit['full_text'] = reddit.apply(lambda row: row['title'] + '. ' + row['text'] if pd.notna(row['text']) else row['title'], axis=1)
reddit = reddit.drop(['title','text'], axis=1)

reddit = reddit.rename(columns={'full_text': 'Text'})
reddit = reddit.sort_values(by='Date', ascending=True)

In [None]:
# # Apply the function to the dataframe and create new columns for 'Label' and 'Score'
# reddit[['Label', 'Score']] = reddit.apply(get_sentiment, axis=1)

reddit_scores = pd.read_csv('reddit_scores.csv')
reddit_scores = pd.DataFrame(reddit_scores).drop(columns=['Unnamed: 0'])

In [None]:
# Calculate a sentiment score that combines the label and the score

def combine_label_score(label, score):
    """
    Fold a sentiment label into its score to get one signed value.

    'positive' keeps the score, 'negative' flips its sign, and any other
    label returns the score divided by 10.
    """
    if label == 'negative':
        return -score
    if label == 'positive':
        return score
    # Neutral (or any unrecognised) label: divide score by 10
    return score/10

# Signed sentiment per post, averaged per day, then aligned to the trading calendar.
reddit_scores['redditSentiment'] = reddit_scores.apply(lambda row: combine_label_score(row['Label'], row['Score']), axis=1)
reddit_scores = reddit_scores.groupby('Date')['redditSentiment'].mean().reset_index()
reddit_scores = reddit_scores.set_index('Date')
# 'Date' comes from CSV as text; parse it so the labels can match msft_df's index
# (same conversion the news and social-media cells apply before reindexing).
reddit_scores.index = pd.to_datetime(reddit_scores.index)
reddit_scores = reddit_scores.reindex(msft_df.index).fillna(value = 0)

## News Sentiment

News data starting from August 16, 2022.

Data obtained from Financial Modeling Prep

In [None]:
# function to download news data from Financial Modeling Prep

def fetch_news_data(api_key):
  """
  Page through the Financial Modeling Prep news-sentiment RSS feed and collect MSFT articles.

  Pages are requested from page 0 upward until a page is empty or lacks a
  'symbol' column. Rows with symbol 'MSFT' are kept and their 'publishedDate'
  is parsed as UTC.

  Parameters:
  - api_key: API key inserted into the request URL.

  Returns:
  - DataFrame of MSFT articles with 2020-01-01 <= publishedDate < 2024-03-06,
    or an empty DataFrame when no MSFT article was found.

  Raises:
  - requests.HTTPError: if a page request returns an HTTP error status.
  """
  msft_pages = []
  page = 0

  while True:
    url = f'https://financialmodelingprep.com/api/v4/stock-news-sentiments-rss-feed?page={page}&apikey={api_key}'
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    if not data:
      break  # Stop if there's no data

    df = pd.DataFrame(data)

    if 'symbol' not in df.columns:
      break  # Stop if the expected column is not in the dataframe

    # .copy() so the column assignment below writes to an independent frame
    # rather than to a slice of `df`.
    msft_news = df[df['symbol'] == 'MSFT'].copy()

    if not msft_news.empty:
      msft_news['publishedDate'] = pd.to_datetime(msft_news['publishedDate'], utc=True)
      msft_pages.append(msft_news)

    page += 1

  if not msft_pages:
    return pd.DataFrame()

  all_news_data = pd.concat(msft_pages, ignore_index=True)
  # Keep articles from 2020-01-01 up to (not including) 2024-03-06
  in_window = ((all_news_data['publishedDate'] >= '2020-01-01') &
               (all_news_data['publishedDate'] < '2024-03-06'))
  return all_news_data[in_window]

In [None]:
news = (pd.read_csv('msft_news_data.csv')
        .drop(columns=['title', 'site', 'text', 'sentiment'])
        .rename(columns={'sentimentScore': 'news_sentimentScore'}))

news['Date'] = pd.to_datetime(news['Date']).dt.strftime('%Y-%m-%d')

news = news.groupby('Date')['news_sentimentScore'].mean().reset_index()
news = news.set_index('Date')
news.index = pd.to_datetime(news.index)
news = news.reindex(msft_df.index).fillna(value = 0)

## AlphaVantage News

More news articles about MSFT from different sources.

Data obtained from AlphaVantage using an API key.

In [None]:
# Download data from AlphaVantage and save to csv file
relevant_data = []

# Define start and end dates as datetime objects
start_date = datetime(2022, 3, 1)
end_date = datetime(2024, 3, 6)

# Define API key
api_key = 'api_key'

# Loop through date range with 1-day increments
while start_date <= end_date:
  # Convert start and end dates to string format
  time_from = start_date.strftime('%Y%m%d')
  time_to = (start_date + timedelta(days=1)).strftime('%Y%m%d')

  # Construct API URL
  url = f'https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers=MSFT&time_from={time_from}T0000&time_to={time_to}T0000&apikey={api_key}'

  # Make API call
  r = requests.get(url)

  # Check if API call was successful
  if r.status_code == 200:
    # Parse JSON response
    data = r.json()

    # A 200 response does not guarantee a 'feed' key (e.g. an informational or
    # rate-limit payload), so report the day instead of raising KeyError.
    if 'feed' not in data:
      print(f"No feed returned for {time_from}; response keys: {list(data.keys())}")

    # Keep an article as soon as one of its ticker entries is MSFT with relevance > 0.7
    for item in data.get('feed', []):
      for sentiment_item in item.get('ticker_sentiment', []):
        if ('ticker' in sentiment_item and 'relevance_score' in sentiment_item
            and sentiment_item['ticker'] == 'MSFT'
            and float(sentiment_item['relevance_score']) > 0.7):
          relevant_data.append(item)
          break  # No need to continue checking ticker_sentiment if MSFT is found

  # Increment start date by 1 day for the next iteration
  start_date += timedelta(days=1)

# Convert relevant_data to a DataFrame (assuming relevant_data is a list of dictionaries)
df = pd.DataFrame(relevant_data)

# Save DataFrame to a CSV file
csv_filename = 'relevant_data.csv'
df.to_csv(csv_filename, index=False)

# Download the CSV file to your local machine
files.download(csv_filename)

In [None]:
relevant_data_all_columns = pd.read_csv('av_news_data.csv')
relevant_data_all_columns['ticker_sentiment'] = relevant_data_all_columns['ticker_sentiment'].apply(ast.literal_eval)

In [None]:
relevant_data = pd.DataFrame()
relevant_data['Date'] = relevant_data_all_columns['time_published']
# Trim the contents of the 'Date' column to keep only the first 8 characters
relevant_data['Date'] = relevant_data['Date'].str[:8]
# Convert 'Date' column to datetime type with the format YYYY-MM-DD
relevant_data['Date'] = pd.to_datetime(relevant_data['Date'], format='%Y%m%d')

In [None]:
# For every article, take the 'ticker_sentiment_score' of the first entry whose
# ticker starts with 'MSFT' (None when the article has no such entry).
sentiment_scores = [
  next(
    (sentiment_dict.get('ticker_sentiment_score')
     for sentiment_dict in sentiment_list
     if 'ticker' in sentiment_dict and sentiment_dict['ticker'].startswith('MSFT')),
    None,
  )
  for sentiment_list in relevant_data_all_columns['ticker_sentiment']
]

# Convert the scores to floats; float(None) raises TypeError, so a missing
# score is stored as NaN instead.
sentiment_scores_float = [np.nan if score is None else float(score) for score in sentiment_scores]

# Add the 'AV Sentiment Score' column to the relevant_data DataFrame
relevant_data['AV Sentiment Score'] = sentiment_scores_float

In [None]:
relevant_data['text'] = relevant_data_all_columns['title'] + '. ' + relevant_data_all_columns['summary']

def preprocess_text(text):
  """Return `text` lower-cased with leading and trailing whitespace removed."""
  lowered = text.lower()
  return lowered.strip()

relevant_data['text'] = relevant_data['text'].apply(preprocess_text)

In [None]:
# # Use the finBERT model to predict sentiment for each text
sentiment_results = finbert_sentiment(relevant_data['text'].tolist())

# # Extract sentiment labels and scores (adjust based on model output format)
relevant_data['sentiment'] = [result['label'] for result in sentiment_results]
relevant_data['finBERT sentiment_score'] = [result['score'] for result in sentiment_results]

In [None]:
# Scored articles, indexed by parsed publication date.
converted_scores_with_text = (
    pd.read_csv('converted_scores.csv')
      .assign(Date=lambda df: pd.to_datetime(df['Date']))
      .set_index('Date')
)

# Keep only the numeric score columns and give the converted score its feature name.
converted_scores = (
    converted_scores_with_text
      .drop(columns=['text','sentiment','finBERT sentiment_score'])
      .rename(columns={'converted_score': 'finbert_news_score'})
)

# Daily mean of the scores, aligned to the trading calendar; days without articles become 0.
av_news = (
    converted_scores
      .groupby('Date')
      .mean()
      .reindex(msft_df.index)
      .fillna(value = 0)
)

## Dividends

Dividend declaration and payment data for MSFT.

Data obtained from Financial Modeling Prep

In [None]:
url = f'https://financialmodelingprep.com/api/v3/historical-price-full/stock_dividend/MSFT?&apikey={api_key}'
response = requests.get(url)
data = response.json()
df = pd.DataFrame(data['historical'])
# Save DataFrame to a CSV file
csv_filename = 'msft_dividends.csv'
df.to_csv(csv_filename, index=True)

# Download the CSV file
files.download(csv_filename)

In [None]:
df = pd.read_csv('msft_dividends.csv')

In [None]:
div_payment = (df[['adjDividend', 'paymentDate']]
               .rename(columns={'adjDividend': 'Dividend', 'paymentDate': 'Date'})
               .assign(Date=lambda x: pd.to_datetime(x['Date']))
               .set_index('Date')
               .reindex(msft_df.index, fill_value=0))

In [None]:
div_declar = (df[['adjDividend', 'declarationDate']]
              .drop_duplicates('declarationDate')
               .rename(columns={'adjDividend': 'Dividend', 'declarationDate': 'Date'})
               .assign(Date=lambda x: pd.to_datetime(x['Date']))
               .set_index('Date')
               .reindex(msft_df.index, fill_value=0))

In [None]:
div_declar_filtered = div_declar[(div_declar.index.isin(msft_df.index)) & (div_declar['Dividend'] != 0)]

fig = go.Figure()
fig.add_trace(go.Scatter(x=msft_df.index, y=msft_df['Adj Close'], mode='lines',
                         line=dict(color='#00274C'),
                         name='Adj Closing Price'))

# Add vertical lines for dividend declaration dates
for date in div_declar_filtered.index:
    fig.add_vline(x=date, line_width=1, line_color='#FFCB05')

# Update layout with titles and axis labels
fig.update_layout(
    title="MSFT Adjusted Closing Price with Dividend Declarations",
    xaxis_title="Date",
    yaxis_title="Adjusted Close Price",
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False)

)

fig.show()

## Ratios

Key ratios for MSFT

Data obtained from Financial Modeling Prep

In [None]:
url = f'https://financialmodelingprep.com/api/v3/ratios/MSFT?period=quarter&apikey={api_key}'
response = requests.get(url)
data = response.json()
df = pd.DataFrame(data)
# Save DataFrame to a CSV file
csv_filename = 'msft_ratios.csv'
df.to_csv(csv_filename, index=True)

# Download the CSV file
files.download(csv_filename)

In [None]:
# Load the quarterly ratios and spread them over the daily trading calendar.
ratios = pd.read_csv('msft_ratios.csv')
ratios = pd.DataFrame(ratios)
ratios['Date'] = pd.to_datetime(ratios['Date'])

ratios = ratios.set_index('Date')
# Keep the first CSV row as a one-row frame; it is prepended below so the
# interpolation has a starting value before the first matched date.
# NOTE(review): which quarter this row is depends on the CSV's row order — confirm.
first_row = ratios.iloc[0]
first_row = first_row.to_frame().T
# Only report dates that are also trading days survive this reindex; all other days are NaN.
ratios = ratios.reindex(msft_df.index)
ratios = pd.concat([first_row, ratios], axis=0)
# method='linear' treats rows as equally spaced and fills each gap using the values
# on both sides of it.
# NOTE(review): a day between two reports therefore depends on the later report's
# value, which looks like look-ahead for a next-day price model — confirm this is intended.
ratios = ratios.interpolate(method='linear', axis=0)
# Drop the helper row that was prepended above.
ratios = ratios.iloc[1:]

## Key Metrics

Key metrics for MSFT.

Data obtained from Financial Modeling Prep

In [None]:
url = f'https://financialmodelingprep.com/api/v3/key-metrics/MSFT?period=quarter&apikey={api_key}'
response = requests.get(url)
data = response.json()
df = pd.DataFrame(data)

# Save DataFrame to a CSV file
csv_filename = 'msft_metrics.csv'
df.to_csv(csv_filename, index=True)

# Download the CSV file
files.download(csv_filename)

In [None]:
# Load the quarterly key metrics and spread them over the daily trading calendar.
metrics = pd.read_csv('msft_metrics.csv')
metrics = pd.DataFrame(metrics)
metrics['Date'] = pd.to_datetime(metrics['Date'])

metrics = metrics.set_index('Date')
metrics = metrics.drop(columns = ['marketCap','netIncomePerShare'])
# Keep the first CSV row as a one-row frame; it is prepended below so the
# interpolation has a starting value before the first matched date.
# NOTE(review): which quarter this row is depends on the CSV's row order — confirm.
first_row = metrics.iloc[0]
first_row = first_row.to_frame().T
# Only report dates that are also trading days survive this reindex; all other days are NaN.
metrics = metrics.reindex(msft_df.index)
metrics = pd.concat([first_row, metrics], axis=0)
# method='linear' treats rows as equally spaced and fills each gap using the values
# on both sides of it.
# NOTE(review): a day between two reports therefore depends on the later report's
# value, which looks like look-ahead for a next-day price model — confirm this is intended.
metrics = metrics.interpolate(method='linear', axis=0)
# Drop the helper row that was prepended above.
metrics = metrics.iloc[1:]

# Master Dataframe

Merge features to one master dataframe

In [None]:
# List of all DataFrames to be merged
dfs = [msft_df, vix_df, implied_volatility, grade_df, recommendation_score,
       volume, daily_is_complete, shares_ratios, dbs, daily_cf, surprise,
       transcripts, congress, ratings, social_sentiment, news, div_payment,
       div_declar, ratios, metrics, reddit_scores, av_news]

# Use reduce to apply pd.merge in a chain across all DataFrames in the list
# (inner join on the index, so only dates present in every frame are kept)
master_df = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='inner'), dfs)

# Target: the next row's 'Adj Close'. shift(-1) leaves the final row's target as NaN.
master_df['Shifted Adj Close'] = master_df['Adj Close'].shift(-1)

# Features of the final row, captured before dropna() below removes that row
# (its target is NaN); shaped (1, n_features) as a plain array for model.predict.
last_features_row = master_df.drop(columns=['Adj Close','Shifted Adj Close']).iloc[-1].values.reshape(1, -1)

# Drops every row containing a NaN in any column, including the final row.
master_df = master_df.dropna()
master_df.head()

Unnamed: 0_level_0,Adj Close,Return Over N Days,^VIX9D,^VIX,^VIX3M,^VIX6M,hv10,hv20,hv30,hv60,...,averageInventory,daysSalesOutstanding,daysPayablesOutstanding,daysOfInventoryOnHand,roe,capexPerShare,redditSentiment,AV Sentiment Score,finbert_news_score,Shifted Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-12-28,218.672104,0.41541,20.05,21.7,25.15,27.16,0.161,0.1858,0.1603,0.2381,...,2313820000.0,56.983607,81.971939,12.664909,0.118443,-0.556985,0.0,0.0,0.0,217.88472
2020-12-29,217.88472,0.428096,21.47,23.08,26.24,28.09,0.1663,0.1869,0.161,0.2379,...,2314047000.0,57.010336,81.638237,12.509779,0.118539,-0.555484,0.0,0.0,0.0,215.483765
2020-12-30,215.483765,0.408718,21.31,22.77,25.44,27.5,0.1837,0.1697,0.1654,0.2379,...,2314273000.0,57.037065,81.304535,12.35465,0.118635,-0.553983,0.0,0.0,0.0,216.203079
2020-12-31,216.203079,0.426426,21.2,22.75,25.28,27.29,0.1469,0.1643,0.1628,0.2378,...,2314500000.0,57.063794,80.970833,12.199521,0.118731,-0.552482,0.0,0.0,0.0,211.605316
2021-01-04,211.605316,0.374203,27.39,26.97,28.26,29.46,0.1861,0.1906,0.1804,0.2406,...,2310730000.0,57.059501,81.16036,12.253442,0.118668,-0.554491,0.0,0.0,0.0,211.809448


# Simple Baseline Model

According to financial theory, stock prices follow a random walk, meaning that tomorrow's stock price equals today's price plus a random error term. The following model replicates that behavior and serves as the baseline against which the other models in this project are compared.

In [None]:
def simple_price_with_drift(start_price, end_price, iterations=1000, drift=0.001):
  """
  Simulate one-step random-walk-with-drift prices and score them against the realised price.

  Each simulated price is `start_price + start_price * drift + e`, where `e` is a
  standard-normal draw from NumPy's global random state (seed with np.random.seed
  for reproducible results).

  Parameters:
  - start_price: Price the step starts from.
  - end_price: Realised price the simulations are compared with.
  - iterations: Number of simulated prices.
  - drift: Expected one-step return applied to start_price. The caller supplies it
    (e.g. the historic mean daily return); it is no longer recomputed from a global
    DataFrame on every call.

  Returns:
  - (average simulated price, average error), both rounded to 4 decimals. The error
    of each simulation is the RMSE of a single prediction, i.e. its absolute error.
  """
  error_terms = np.random.randn(iterations)
  new_prices = start_price + start_price * drift + error_terms

  # RMSE over a single (actual, predicted) pair reduces to the absolute difference
  rmses = np.abs(end_price - new_prices)

  # Calculate the average new price and average RMSE across all iterations
  average_new_price = round(np.mean(new_prices), 4)
  average_rmse = round(np.mean(rmses), 4)

  return average_new_price, average_rmse

In [None]:
rmse_full_timeframe = []
predicted_simple_price_full_timeframe = []
drift = np.mean(msft_df['Adj Close'].pct_change().dropna())
# Calculate for msft_df prices
for i in tqdm(range(1, len(msft_df))):
  start_price = msft_df['Adj Close'].iloc[i-1]
  end_price = msft_df['Adj Close'].iloc[i]
  new_price, average_rmse = simple_price_with_drift(start_price, end_price, 1000, drift)
  rmse_full_timeframe.append(average_rmse)
  predicted_simple_price_full_timeframe.append(new_price)

# Calculate the average RMSE across the entire timeframe
average_rmse_over_timeframe = np.mean(rmse_full_timeframe)

  0%|          | 0/1049 [00:00<?, ?it/s]

In [None]:
average_rmse_over_timeframe

3.6585227836034315

In [None]:
# simple price forecast for last day, using the same historic mean daily return
# (`drift`) that the full-timeframe loop passes in
simple_prediction = simple_price_with_drift(414.92, 402.65, 1000, drift)

In [None]:
msft_df_dropped_first = msft_df.iloc[1:].copy()
simple_model_df = msft_df_dropped_first
simple_model_df['Predicted Price'] = predicted_simple_price_full_timeframe

simple_model_df = simple_model_df.set_index(msft_df_dropped_first.index)

# Display the resulting DataFrame
simple_model_df.head()

Unnamed: 0_level_0,Adj Close,Predicted Price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-03,152.570129,154.6852
2020-01-06,152.964462,152.6698
2020-01-07,151.569794,153.1101
2020-01-08,153.984055,151.742
2020-01-09,155.907791,154.1471


In [None]:
simple_model_df_last_250 = simple_model_df.iloc[-250:]

# Create a line plot for 'Adj Close'
fig = go.Figure()
fig.add_trace(go.Scatter(x=simple_model_df_last_250.index, y=simple_model_df_last_250['Adj Close'], mode='lines',
                         name='Adj Close', line=dict(color='#FFCB05')))

# Add a line plot for 'Predicted Price'
fig.add_trace(go.Scatter(x=simple_model_df_last_250.index, y=simple_model_df_last_250['Predicted Price'], mode='lines',
                         name='Predicted Price', line=dict(color='#00274C')))

# Update the layout
fig.update_layout(
    title='MSFT Actual Price vs Simple Predicted Price',
    xaxis_title='Date',
    yaxis_title='Price',
    legend_title='Price Type',
    template='plotly_white'
)

fig.show()

# Random Forest Model (All Features)

## Basic Random Forest Model

In [None]:
# Features: every column except today's price and the target; target: next row's price.
X = master_df.drop(columns=['Adj Close','Shifted Adj Close'])
y = master_df['Shifted Adj Close']

# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Placeholder list to store RMSE for each fold
rmse_scores = []

# Placeholder for feature importances
feature_importances = None

for train_index, test_index in tqdm(tscv.split(X), total=tscv.n_splits):
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  # A fresh forest is fitted for every fold; after the loop `model` refers to
  # the forest fitted on the last fold's training rows only.
  model = RandomForestRegressor(n_estimators=10000, random_state=42)
  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  # Compute and store RMSE for the current fold
  rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

  # Update feature importances - using the last split here for simplicity
  # (overwritten each iteration, so only the final fold's importances remain)
  feature_importances = pd.Series(model.feature_importances_, index=X.columns)

# Average RMSE across folds
average_rmse = np.mean(rmse_scores)
print(f"Average RMSE: {average_rmse}")

100%|██████████| 5/5 [35:53<00:00, 430.64s/it]

Average RMSE: 35.334841778900966





In [None]:
sorted_feature_importances = feature_importances.sort_values(ascending=False)
top_ten_features = sorted_feature_importances[0:10]
top_ten_features

epsdiluted                           0.141138
netIncome                            0.140621
eps                                  0.135894
enterpriseValue                      0.087658
cashAtBeginningOfPeriod              0.043145
dividendPaidAndCapexCoverageRatio    0.011736
investedCapital                      0.009860
totalStockholdersEquity              0.009509
longTermDebtToCapitalization         0.009426
totalAssets                          0.009287
dtype: float64

In [None]:
next_day_prediction = model.predict(last_features_row)

print(f"Next day predicted 'Adj Close': {next_day_prediction[0]}")



Next day predicted 'Adj Close': 328.03809423217774


In [None]:
# save output

# RMSE score
rmse_df_all_basic = pd.DataFrame({'RMSE Scores': [average_rmse]})
rmse_df_all_basic.to_csv('results_all_basic_rmse_scores.csv', index=False)
files.download('results_all_basic_rmse_scores.csv')

# Feature importances
sorted_feature_importances.to_csv('results_all_basic_sorted_feature_importances.csv', index=True)
files.download('results_all_basic_sorted_feature_importances.csv')

# Price prediction
predictions_df = pd.DataFrame(next_day_prediction, columns=['Prediction'])
predictions_df.to_csv('results_all_basic_random_forest_prediction.csv', index=True)
files.download('results_all_basic_random_forest_prediction.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
rmse_df_all_basic = pd.read_csv('results_all_basic_rmse_scores.csv')
rmse_df_all_basic = pd.DataFrame(rmse_df_all_basic)

## Train Model with Top Features Obtained from Basic Model

In [None]:
def train_with_top_features_and_predict(X, y, top_features, n_estimators=7000,
                                        n_splits=5, random_state=42):
  """
  Cross-validates random forests on a growing set of top features, then
  fits a final model on all rows and predicts from the last row of X.

  For i = 1 .. len(top_features), the first i features are evaluated with
  TimeSeriesSplit and the mean fold RMSE is recorded.

  Parameters:
  - X: DataFrame containing all features.
  - y: Series containing the target variable, aligned row-for-row with X.
  - top_features: Series or DataFrame whose index holds feature names,
    ordered from most to least important. Only the index is used.
  - n_estimators: Number of trees in every forest (default 7000).
  - n_splits: Number of TimeSeriesSplit folds (default 5).
  - random_state: Seed passed to every forest (default 42).

  Returns:
  - Tuple (rmse_scores_by_feature_count, next_day_prediction, final_model):
    * rmse_scores_by_feature_count: list with one mean RMSE per feature count.
    * next_day_prediction: scalar prediction for the last row of X.
    * final_model: forest fitted on every row of X using all top features.

  Raises:
  - ValueError: if top_features is empty.
  """
  ranked_features = list(top_features.index)
  if not ranked_features:
    raise ValueError("top_features must contain at least one feature")

  rmse_scores_by_feature_count = []

  # Initialize TimeSeriesSplit
  tscv = TimeSeriesSplit(n_splits=n_splits)

  for i in tqdm(range(1, len(ranked_features) + 1)):
    # Select the top i features
    X_top_i = X[ranked_features[:i]]

    rmse_scores = []  # Store RMSE for each fold

    for train_index, test_index in tscv.split(X_top_i):
      X_train, X_test = X_top_i.iloc[train_index], X_top_i.iloc[test_index]
      y_train, y_test = y.iloc[train_index], y.iloc[test_index]

      # Train the model
      model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
      model.fit(X_train, y_train)

      y_pred = model.predict(X_test)

      # Compute and store RMSE for the current fold
      rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    # Average RMSE across folds for the current number of top features
    rmse_scores_by_feature_count.append(np.mean(rmse_scores))

  # Fit a fresh model on all rows using every supplied top feature
  X_train_final = X[ranked_features]
  final_model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
  final_model.fit(X_train_final, y)

  # Predict from the last row, kept as a one-row DataFrame so the column
  # names match the ones the model was fitted with
  next_day_prediction = final_model.predict(X_train_final.iloc[[-1]])

  return rmse_scores_by_feature_count, next_day_prediction[0], final_model

In [None]:
# Reload the saved feature importances. The first CSV column holds the feature
# names, so it is read straight into the index rather than relying on the
# 'Unnamed: 0' label pandas invents for a header-less column.
top_ten_features = pd.read_csv('results_all_basic_sorted_feature_importances.csv', index_col=0)
top_ten_features = top_ten_features.iloc[:10]

In [None]:
# Features are every column except 'Adj Close' and the 'Shifted Adj Close' target.
df = master_df
y = df['Shifted Adj Close']
X = df.drop(columns=['Adj Close', 'Shifted Adj Close'])

# The helper returns (RMSE per feature count, next-day prediction, fitted model).
training_outcome = train_with_top_features_and_predict(X, y, top_ten_features)
rmse_scores_list, next_day_pred, model = training_outcome

print("RMSE scores by number of features used:", rmse_scores_list)
print("Predicted price for the next day:", next_day_pred)

In [None]:
# Feature counts 1..N, matching the order of rmse_scores_list.
num_top_features = list(range(1, len(top_ten_features) + 1))

# One row per feature count; the single next-day prediction repeats on every row.
rmse_df_all_train_top_features = pd.DataFrame({
    'Num_Top_Features': num_top_features,
    'RMSE_Scores': rmse_scores_list,
}).assign(**{'Next_Day_Prediction based on final model': next_day_pred})

output_csv = 'results_all_train_top_features_rmse_and_prediction.csv'
rmse_df_all_train_top_features.to_csv(output_csv, index=False)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the saved RMSE/prediction table; read_csv already yields a DataFrame.
rmse_df_all_train_top_features = pd.read_csv('results_all_train_top_features_rmse_and_prediction.csv')

## Random Forest Model with Grid Search CV to get Top Features

In [None]:
# Model with GridSearchCV
X = master_df.drop(columns=['Adj Close','Shifted Adj Close'])
y = master_df['Shifted Adj Close']

# Define the parameter grid to search
param_grid = {
    'n_estimators': [1500, 3000, 6000],  # List of numbers of trees
    # 'max_depth': [None, 30, 120],  # Maximum depth of trees, including 'None' for full growth
    'min_samples_split': [2, 12, 20], # Minimum number of samples required to split a node
    # 'min_samples_leaf': [1, 3, 8],    # Minimum number of samples required at each leaf node
    # 'max_features': [1.0, 'sqrt', 40]  # Number of features to consider at every split
}

# Initialize the model
model = RandomForestRegressor(random_state=42)

tscv = TimeSeriesSplit(n_splits=5)

# Score with scikit-learn's built-in negated-RMSE scorer. This avoids passing
# `squared=False` through make_scorer, which newer scikit-learn releases
# deprecate and then remove. Scores are negated, so larger is still better.
rmse_scorer = 'neg_root_mean_squared_error'

# GridSearchCV with TimeSeriesSplit cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=tscv, scoring=rmse_scorer, verbose=1, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X, y)

# Access the best estimator
best_model = grid_search.best_estimator_

# Feature importances from the best model
feature_importances = pd.Series(best_model.feature_importances_, index=X.columns)

# Sort feature importances in descending order and select top 10
sorted_feature_importances = feature_importances.sort_values(ascending=False)
top_ten_features = sorted_feature_importances[:10]

# Keep the last row as a one-row DataFrame so predict() receives the same
# column names the model was fitted with
last_features_row = X.iloc[[-1]]

next_day_prediction = best_model.predict(last_features_row)

# Print the results (best_score_ is negated RMSE, so flip the sign)
print("Best parameters:", grid_search.best_params_)
print("Best RMSE score:", -grid_search.best_score_)
print(f"Top 10 features:\n{top_ten_features}")
print("Next day prediction:", next_day_prediction[0])

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters: {'min_samples_split': 12, 'n_estimators': 3000}
Best RMSE score: 35.23601112757803
Top 10 features:
enterpriseValue                       0.646652
^VIX6M                                0.046765
^VIX3M                                0.023085
capitalExpenditureCoverageRatio       0.015604
netReceivables                        0.008085
freeCashFlowOperatingCashFlowRatio    0.006726
capexToOperatingCashFlow              0.006628
hv150                                 0.005411
hv120                                 0.005251
totalEquity                           0.005165
dtype: float64
Next day prediction: 405.9340169863772




In [None]:
# Save the grid-search outputs as CSV files and download each one through Colab.
# best_score_ is negated in the cell that prints "Best RMSE score", and here too.
rmse_scores = -grid_search.best_score_
rmse_df = pd.DataFrame({'RMSE Scores': [rmse_scores]})
predictions_df = pd.DataFrame(next_day_prediction, columns=['Prediction'])

# Each entry: (object to write, CSV file name, whether the index is written).
for artefact, csv_name, keep_index in [
    (rmse_df, 'rmse_scores.csv', False),
    (sorted_feature_importances, 'sorted_feature_importances.csv', True),
    (predictions_df, 'random_forest_prediction.csv', True),
]:
  artefact.to_csv(csv_name, index=keep_index)
  files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): no cell visible in this part of the notebook writes this file;
# confirm where 'results_all_gs_random_forest_prediction_and_rmse_scores.csv'
# comes from before relying on this cell after a kernel restart.
results_all_gs_rmse = pd.read_csv('results_all_gs_random_forest_prediction_and_rmse_scores.csv')

## Random Forest with Forward Selection

In [None]:
X = master_df.drop(columns=['Adj Close','Shifted Adj Close'])
y = master_df['Shifted Adj Close']

# Time-ordered cross-validation folds used to score every candidate feature set
tscv = TimeSeriesSplit(n_splits=5)

# List to keep track of selected features and performance
selected_features = []
# Candidates come from X itself. Reading them from an X_train left behind by
# another cell would restrict (or break) the search depending on run order.
remaining_features = list(X.columns)
best_score = float('inf')

while remaining_features and len(selected_features) < 10:
  score_improvement = False
  for feature in remaining_features:
    # Test adding the current feature
    trial_features = selected_features + [feature]
    trial_X = X[trial_features]

    # Cross-validate the model with the current set of features
    rmse_scores = []
    for train_index, test_index in tscv.split(trial_X):
      X_train, X_test = trial_X.iloc[train_index], trial_X.iloc[test_index]
      y_train, y_test = y.iloc[train_index], y.iloc[test_index]

      model = RandomForestRegressor(n_estimators=10000, random_state=42)
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
      rmse_score = np.sqrt(mean_squared_error(y_test, y_pred))
      rmse_scores.append(rmse_score)

    # Calculate the average RMSE across time-series splits
    average_rmse = np.mean(rmse_scores)

    # Check if the RMSE has improved; best_feature ends the round holding the
    # candidate with the lowest RMSE seen so far
    if average_rmse < best_score:
      best_score = average_rmse
      best_feature = feature
      score_improvement = True

  # If the feature improves the model, add it to the selected features
  if score_improvement:
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)
  else:
    # If no improvement, exit the while loop
    break

print("Selected features:", selected_features)
print("Best RMSE Score:", best_score)

# Train the final model on the selected features
# Use the last split as the hold-out test set
train_index, test_index = list(tscv.split(X[selected_features]))[-1]
X_train, X_test = X[selected_features].iloc[train_index], X[selected_features].iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

final_model = RandomForestRegressor(n_estimators=100, random_state=42)
final_model.fit(X_train, y_train)
final_predictions = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))

# Predict the next day from the last available row, kept as a one-row
# DataFrame so predict() receives the column names the model was fitted with
last_features_row = X[selected_features].iloc[[-1]]
next_day_prediction = final_model.predict(last_features_row)

print("Final RMSE on the hold-out test set:", final_rmse)
print("Next day prediction:", next_day_prediction[0])

Selected features: ['hv60']
Best RMSE Score: 80.74162455339832
Final RMSE on the hold-out test set: 113.43419127620191
Next day prediction: 284.4680347442627




In [None]:
# One row per selected feature; the scalar RMSE and prediction repeat on each row.
results_all_fs_features_rmse_and_predictions = results_df = pd.DataFrame({
    'Selected_Features': selected_features,
    'RMSE': final_rmse,
    'Next_Day_Prediction': next_day_prediction[0],
})

output_csv = 'results_all_fs_features_rmse_and_predictions.csv'
results_all_fs_features_rmse_and_predictions.to_csv(output_csv, index=False)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the saved forward-selection results; read_csv already yields a DataFrame.
results_all_fs_features_rmse_and_predictions = pd.read_csv('results_all_fs_features_rmse_and_predictions.csv')

## Random Forest with Backward Elimination

In [None]:
def _rf_cv_rmse(X, y, tscv, n_estimators, random_state):
  """Returns the mean RMSE of a random forest over the folds produced by tscv."""
  fold_rmse = []
  for train_index, test_index in tscv.split(X):
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = model.predict(X.iloc[test_index])
    fold_rmse.append(np.sqrt(mean_squared_error(y.iloc[test_index], y_pred)))
  return np.mean(fold_rmse)


def rf_backward_elimination(X, y, n_splits=5, cv_estimators=100,
                            final_estimators=1000, random_state=42):
  """
  Greedy backward feature elimination scored by cross-validated RMSE.

  Starting from every column of X, each round scores the model with one
  feature left out at a time and drops the feature whose removal gives the
  lowest mean RMSE. A feature is only dropped if that RMSE beats the best
  seen so far, which starts as the score of the full feature set. The search
  stops when no removal helps or a single feature is left.

  Parameters:
  - X: DataFrame of candidate features.
  - y: Series with the target, aligned row-for-row with X.
  - n_splits: Number of TimeSeriesSplit folds (default 5).
  - cv_estimators: Trees per forest while scoring candidates (default 100).
  - final_estimators: Trees in the final forest (default 1000).
  - random_state: Seed passed to every forest (default 42).

  Returns:
  - Tuple (best_features, best_rmse, next_day_prediction):
    * best_features: list of the column names that were kept.
    * best_rmse: mean cross-validated RMSE of that feature set.
    * next_day_prediction: scalar prediction for the last row of X from a
      forest fitted on every row with best_features.

  Raises:
  - ValueError: if X has no columns.
  """
  features = X.columns.tolist()
  if not features:
    raise ValueError("X must contain at least one feature column")

  # Initialize TimeSeriesSplit
  tscv = TimeSeriesSplit(n_splits=n_splits)

  # Score the full feature set first so the first removal also has to earn
  # its place instead of being accepted unconditionally
  best_rmse = _rf_cv_rmse(X[features], y, tscv, cv_estimators, random_state)
  best_features = features.copy()
  improvement = True

  # Stop at one feature: removing it would leave nothing to fit on
  while improvement and len(features) > 1:
    improvement = False
    feature_rmse = []

    progress = tqdm(features, desc="Evaluating features")
    for feature in progress:
      # Show the current feature on the progress bar rather than printing a line per feature
      progress.set_postfix_str(feature)
      trial_features = [f for f in features if f != feature]
      average_rmse = _rf_cv_rmse(X[trial_features], y, tscv, cv_estimators, random_state)
      feature_rmse.append((feature, average_rmse))

    # Find the feature whose removal gives the best average RMSE
    # (the first one wins on ties)
    removed_feature, removed_rmse = min(feature_rmse, key=lambda item: item[1])
    if removed_rmse < best_rmse:
      best_rmse = removed_rmse
      features.remove(removed_feature)
      best_features = features.copy()
      improvement = True
      print(f"Removed {removed_feature}, Best RMSE: {best_rmse}")
    else:
      print("No further improvement.")

  final_X = X[best_features]
  final_model = RandomForestRegressor(n_estimators=final_estimators, random_state=random_state)
  final_model.fit(final_X, y)

  # Make a one-day prediction using the last available data point, kept as a
  # one-row DataFrame so predict() receives the fitted column names
  next_day_prediction = final_model.predict(final_X.iloc[[-1]])

  return best_features, best_rmse, next_day_prediction[0]

In [None]:
# Run backward elimination on the full feature set here, so the values saved
# below belong to this section and the cell works on a fresh kernel.
# NOTE(review): this search is expensive on all of master_df's columns.
X_all = master_df.drop(columns=['Adj Close', 'Shifted Adj Close'])
y_all = master_df['Shifted Adj Close']
best_features, best_rmse, one_day_pred = rf_backward_elimination(X_all, y_all)

# One row per kept feature; the scalar RMSE and prediction repeat on each row.
results_df = pd.DataFrame({
    'Best_Features': best_features,
    'RMSE': best_rmse,
    'Next_Day_Prediction': one_day_pred
})
results_all_be_features_rmse_and_predictions = results_df
results_all_be_features_rmse_and_predictions.to_csv('results_all_be_features_rmse_and_predictions.csv', index=True)
files.download('results_all_be_features_rmse_and_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the saved backward-elimination results; read_csv already yields a DataFrame.
results_all_be_features_rmse_and_predictions = pd.read_csv('results_all_be_features_rmse_and_predictions.csv')

# Random Forest Model with Discretionary Selected Features

## Basic Random Forest Model

In [None]:
# Target column followed by the hand-picked feature columns.
selected_columns = [
    'Shifted Adj Close',
    'dividendsPaid', '^VIX', 'ivmean30', 'ivput270', 'Grade', 'weightedScore',
    'revenue', 'netIncome', 'totalAssets', 'eps', 'operatingCashFlowPerShare',
    'freeCashFlowPerShare', 'Surprise', 'congress_net_trade', 'Sentiment',
    'stocktwitsSentiment', 'twitterSentiment', 'news_sentimentScore',
    'Dividend_y', 'netProfitMargin', 'returnOnEquity', 'ebitda', 'roic',
]

# Skip the first 253 rows.
# NOTE(review): the reason for 253 is not stated in this part of the notebook — confirm.
df = master_df[selected_columns].iloc[253:]

# Features of the most recent row as a (1, n_features) array for predict().
last_features_row = df.drop(columns=['Shifted Adj Close']).iloc[-1].to_numpy().reshape(1, -1)

In [None]:
# Cross-validate a random forest on the hand-picked features.
# The loop variables below (model, feature_importances, X_train, ...) stay in
# the notebook namespace after the loop and are read by later cells.
X = df.drop(columns=['Shifted Adj Close'])
y = df['Shifted Adj Close']

# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Placeholder list to store RMSE for each fold
rmse_scores = []

# Placeholder for feature importances
feature_importances = None

for train_index, test_index in tqdm(tscv.split(X), total=tscv.n_splits):
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  # Train the model
  model = RandomForestRegressor(n_estimators=10000, random_state=42)
  model.fit(X_train, y_train)

  # Make predictions
  y_pred = model.predict(X_test)

  # Compute and store RMSE for the current fold
  rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

  # Overwritten on every fold, so only the last fold's importances survive
  feature_importances = pd.Series(model.feature_importances_, index=X.columns)

# Average RMSE across folds
average_rmse = np.mean(rmse_scores)
print(f"Average RMSE: {average_rmse}")


100%|██████████| 5/5 [02:33<00:00, 30.79s/it]

Average RMSE: 43.1642011637954





In [None]:
# Rank the features from most to least important, then keep the first eight.
sorted_feature_importances = feature_importances.sort_values(ascending=False)
top_eight_features = sorted_feature_importances.iloc[:8]
top_eight_features

eps                 0.220288
netIncome           0.218502
twitterSentiment    0.215219
netProfitMargin     0.106305
totalAssets         0.071295
returnOnEquity      0.031568
ebitda              0.023007
revenue             0.022562
dtype: float64

In [None]:
# `model` and `last_features_row` are not created in this cell; both must
# already exist in the kernel from earlier cells.
next_day_prediction = model.predict((last_features_row))

# predict() returns an array; element 0 is the single forecast.
print(f"Next day predicted 'Adj Close': {next_day_prediction[0]}")



Next day predicted 'Adj Close': 331.25040352478027


In [None]:
# Save this section's outputs as CSV files and download each one through Colab.
rmse_df_selected_basic = pd.DataFrame({'RMSE Scores': [average_rmse]})
predictions_df = pd.DataFrame(next_day_prediction, columns=['Prediction'])

# Each entry: (object to write, CSV file name, whether the index is written).
for artefact, csv_name, keep_index in [
    (rmse_df_selected_basic, 'results_selected_basic_rmse_scores.csv', False),
    (sorted_feature_importances, 'results_selected_basic_sorted_feature_importances.csv', True),
    (predictions_df, 'results_selected_basic_random_forest_prediction.csv', True),
]:
  artefact.to_csv(csv_name, index=keep_index)
  files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the saved RMSE table; read_csv already yields a DataFrame.
rmse_df_selected_basic = pd.read_csv('results_selected_basic_rmse_scores.csv')

## Train with Top Features

In [None]:
# Despite its name, this holds the helper's full return value: a tuple of
# (RMSE list, next-day prediction, fitted model). Index 0 is the RMSE list.
selected_train_top_features_rmse_scores_list = train_with_top_features_and_predict(X, y, top_eight_features)

100%|██████████| 8/8 [06:43<00:00, 50.46s/it]


In [None]:
# The stored value is the tuple (RMSE list, prediction, model); print only the
# RMSE list so the output matches its label.
print("RMSE scores for models with incremental top features:", selected_train_top_features_rmse_scores_list[0])

RMSE scores for models with incremental top features: ([38.43422975359275, 38.26421569345608, 34.40223457698411, 37.47378480587942, 43.51139570894516, 43.836223845872134, 45.743598893183304, 45.86943292878883], 400.7212232870456, RandomForestRegressor(n_estimators=7000, random_state=42))


In [None]:
num_top_features = list(range(1, len(top_eight_features) + 1))

# The helper's return value is a tuple of (RMSE list, next-day prediction,
# fitted model). Take both values from this run, rather than a `next_day_pred`
# left over from the all-features section.
selected_rmse_scores, selected_next_day_pred = selected_train_top_features_rmse_scores_list[:2]

# Create a DataFrame for RMSE scores
rmse_df_selected_train_top_features = pd.DataFrame({
    'Num_Top_Features': num_top_features,
    'RMSE_Scores': selected_rmse_scores
})

# Add the next-day prediction as a new column (the same value repeats on every row)
rmse_df_selected_train_top_features['Next_Day_Prediction based on final model'] = selected_next_day_pred

rmse_df_selected_train_top_features.to_csv('results_selected_train_top_features_rmse_and_prediction.csv', index=False)
files.download('results_selected_train_top_features_rmse_and_prediction.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the saved RMSE/prediction table; read_csv already yields a DataFrame.
rmse_df_selected_train_top_features = pd.read_csv('results_selected_train_top_features_rmse_and_prediction.csv')

## Random Forest With Grid Search CV to get Top Features

In [None]:
# Model with GridSearchCV
X = df.drop(columns=['Shifted Adj Close'])
y = df['Shifted Adj Close']

# Options for the number of features considered at every split.
# 1.0 (all features) replaces 'auto', which scikit-learn deprecated and later
# removed for random forest regressors. The integer 40 is only offered when X
# has more than 40 columns: otherwise it is at best the same as using all
# features, and some scikit-learn releases reject it.
max_features_options = [1.0, 'sqrt']
if X.shape[1] > 40:
  max_features_options.append(40)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [1500, 3000, 6000],  # List of numbers of trees
    'max_depth': [None, 30, 120],  # Maximum depth of trees, including 'None' for full growth
    'min_samples_split': [2, 12, 20], # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 3, 8],    # Minimum number of samples required at each leaf node
    'max_features': max_features_options  # Number of features to consider at every split
}

# Initialize the model
model = RandomForestRegressor(random_state=42)

tscv = TimeSeriesSplit(n_splits=5)

# Score with scikit-learn's built-in negated-RMSE scorer. This avoids passing
# `squared=False` through make_scorer, which newer scikit-learn releases
# deprecate and then remove. Scores are negated, so larger is still better.
rmse_scorer = 'neg_root_mean_squared_error'

# GridSearchCV with TimeSeriesSplit cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=tscv, scoring=rmse_scorer, verbose=1, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X, y)

# Access the best estimator
best_model = grid_search.best_estimator_

# Feature importances from the best model
feature_importances = pd.Series(best_model.feature_importances_, index=X.columns)

# Sort feature importances in descending order and select top 10
sorted_feature_importances = feature_importances.sort_values(ascending=False)
top_ten_features = sorted_feature_importances[:10]

# Keep the last row as a one-row DataFrame so predict() receives the same
# column names the model was fitted with
last_features_row = X.iloc[[-1]]

next_day_prediction = best_model.predict(last_features_row)

# Print the results (best_score_ is negated RMSE, so flip the sign)
print("Best parameters:", grid_search.best_params_)
print("Best RMSE score:", -grid_search.best_score_)
print(f"Top 10 features:\n{top_ten_features}")
print("Next day prediction:", next_day_prediction[0])

In [None]:
# Save the grid-search outputs as CSV files and download each one through Colab.
# best_score_ is negated in the cell that prints "Best RMSE score", and here too.
rmse_scores = -grid_search.best_score_
rmse_df = pd.DataFrame({'RMSE Scores': [rmse_scores]})
predictions_df = pd.DataFrame(next_day_prediction, columns=['Prediction'])

# Each entry: (object to write, CSV file name, whether the index is written).
for artefact, csv_name, keep_index in [
    (rmse_df, 'results_selected_gs_rmse_scores.csv', False),
    (sorted_feature_importances, 'results_selected_gs_sorted_feature_importances.csv', True),
    (predictions_df, 'results_selected_gs_prediction.csv', True),
]:
  artefact.to_csv(csv_name, index=keep_index)
  files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the saved grid-search RMSE; read_csv already yields a DataFrame.
results_selected_gs_rmse_scores = pd.read_csv('results_selected_gs_rmse_scores.csv')

In [None]:
# Reload the saved feature importances; read_csv already yields a DataFrame.
results_selected_gs_features = pd.read_csv('results_selected_gs_sorted_feature_importances.csv')

In [None]:
# Reload the saved prediction; read_csv already yields a DataFrame.
results_selected_gs_prediction = pd.read_csv('results_selected_gs_prediction.csv')

## Random Forest with Forward Selection

In [None]:
X = df.drop(columns=['Shifted Adj Close'])
y = df['Shifted Adj Close']

# Time-ordered cross-validation folds used to score every candidate feature set
tscv = TimeSeriesSplit(n_splits=5)

# List to keep track of selected features and performance
selected_features = []
# Candidates come from X itself. Reading them from an X_train left behind by
# another cell would restrict (or break) the search depending on run order.
remaining_features = list(X.columns)
best_score = float('inf')

while remaining_features and len(selected_features) < 8:
  score_improvement = False
  for feature in remaining_features:
    # Test adding the current feature
    trial_features = selected_features + [feature]
    trial_X = X[trial_features]

    # Cross-validate the model with the current set of features
    rmse_scores = []
    for train_index, test_index in tscv.split(trial_X):
      X_train, X_test = trial_X.iloc[train_index], trial_X.iloc[test_index]
      y_train, y_test = y.iloc[train_index], y.iloc[test_index]

      model = RandomForestRegressor(n_estimators=1000, random_state=42)
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
      rmse_score = np.sqrt(mean_squared_error(y_test, y_pred))
      rmse_scores.append(rmse_score)

    # Calculate the average RMSE across time-series splits
    average_rmse = np.mean(rmse_scores)

    # Check if we've improved the RMSE; best_feature ends the round holding
    # the candidate with the lowest RMSE seen so far
    if average_rmse < best_score:
      best_score = average_rmse
      best_feature = feature
      score_improvement = True

  # If a feature improves the model, add it to the selected features
  if score_improvement:
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)
  else:
    # If no improvement, exit the while loop
    break

print("Selected features:", selected_features)
print("Best RMSE Score:", best_score)

# Train the final model on the selected features
# Use the last split as the hold-out test set
train_index, test_index = list(tscv.split(X[selected_features]))[-1]
X_train, X_test = X[selected_features].iloc[train_index], X[selected_features].iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

final_model = RandomForestRegressor(n_estimators=100, random_state=42)
final_model.fit(X_train, y_train)
final_predictions = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))

# Predict the next day from the last available row, kept as a one-row
# DataFrame so predict() receives the column names the model was fitted with
last_features_row = X[selected_features].iloc[[-1]]
next_day_prediction = final_model.predict(last_features_row)

print("Final RMSE on the hold-out test set:", final_rmse)
print("Next day prediction:", next_day_prediction[0])

In [None]:
# One row per selected feature; the scalar RMSE and prediction repeat on each row.
results_selected_fs_features_rmse_and_predictions = results_df = pd.DataFrame({
    'Selected_Features': selected_features,
    'RMSE': final_rmse,
    'Next_Day_Prediction': next_day_prediction[0],
})

output_csv = 'results_selected_fs_features_rmse_and_predictions.csv'
results_selected_fs_features_rmse_and_predictions.to_csv(output_csv, index=True)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the saved forward-selection results; read_csv already yields a DataFrame.
results_selected_fs_features_rmse_and_predictions = pd.read_csv('results_selected_fs_features_rmse_and_predictions.csv')

## Random Forest with Backward Elimination

In [None]:
# Backward elimination on the hand-picked features.
X = df.drop(columns=['Shifted Adj Close'])
y = df['Shifted Adj Close']
best_features, best_rmse, one_day_pred = rf_backward_elimination(X, y)

print("Selected features:", best_features)
# best_rmse is the average RMSE over the cross-validation folds, not a score
# on a separate hold-out set, so the label says so.
print("Best cross-validated RMSE:", best_rmse)
print("Next day prediction:", one_day_pred)


Evaluating features:   0%|          | 0/23 [00:00<?, ?it/s]

Processing dividendsPaid...


Evaluating features:   4%|▍         | 1/23 [00:01<00:36,  1.68s/it]

Processing ^VIX...


Evaluating features:   9%|▊         | 2/23 [00:03<00:33,  1.59s/it]

Processing ivmean30...


Evaluating features:  13%|█▎        | 3/23 [00:04<00:30,  1.55s/it]

Processing ivput270...


Evaluating features:  17%|█▋        | 4/23 [00:06<00:28,  1.51s/it]

Processing Grade...


Evaluating features:  22%|██▏       | 5/23 [00:07<00:27,  1.52s/it]

Processing weightedScore...


Evaluating features:  26%|██▌       | 6/23 [00:09<00:25,  1.52s/it]

Processing revenue...


Evaluating features:  30%|███       | 7/23 [00:10<00:24,  1.51s/it]

Processing netIncome...


Evaluating features:  35%|███▍      | 8/23 [00:12<00:22,  1.51s/it]

Processing totalAssets...


Evaluating features:  39%|███▉      | 9/23 [00:13<00:20,  1.49s/it]

Processing eps...


Evaluating features:  43%|████▎     | 10/23 [00:15<00:19,  1.50s/it]

Processing operatingCashFlowPerShare...


Evaluating features:  48%|████▊     | 11/23 [00:16<00:17,  1.49s/it]

Processing freeCashFlowPerShare...


Evaluating features:  52%|█████▏    | 12/23 [00:18<00:16,  1.47s/it]

Processing Surprise...


Evaluating features:  57%|█████▋    | 13/23 [00:19<00:14,  1.48s/it]

Processing congress_net_trade...


Evaluating features:  61%|██████    | 14/23 [00:21<00:13,  1.49s/it]

Processing Sentiment...


Evaluating features:  65%|██████▌   | 15/23 [00:22<00:12,  1.50s/it]

Processing stocktwitsSentiment...


Evaluating features:  70%|██████▉   | 16/23 [00:24<00:10,  1.49s/it]

Processing twitterSentiment...


Evaluating features:  74%|███████▍  | 17/23 [00:25<00:08,  1.50s/it]

Processing news_sentimentScore...


Evaluating features:  78%|███████▊  | 18/23 [00:27<00:07,  1.50s/it]

Processing Dividend_y...


Evaluating features:  83%|████████▎ | 19/23 [00:28<00:06,  1.51s/it]

Processing netProfitMargin...


Evaluating features:  87%|████████▋ | 20/23 [00:30<00:04,  1.49s/it]

Processing returnOnEquity...


Evaluating features:  91%|█████████▏| 21/23 [00:31<00:02,  1.48s/it]

Processing ebitda...


Evaluating features:  96%|█████████▌| 22/23 [00:33<00:01,  1.50s/it]

Processing roic...


Evaluating features: 100%|██████████| 23/23 [00:34<00:00,  1.50s/it]


Removed netProfitMargin, Best RMSE: 37.28028085855479


Evaluating features:   0%|          | 0/22 [00:00<?, ?it/s]

Processing dividendsPaid...


Evaluating features:   5%|▍         | 1/22 [00:01<00:29,  1.42s/it]

Processing ^VIX...


Evaluating features:   9%|▉         | 2/22 [00:02<00:27,  1.40s/it]

Processing ivmean30...


Evaluating features:  14%|█▎        | 3/22 [00:04<00:26,  1.39s/it]

Processing ivput270...


Evaluating features:  18%|█▊        | 4/22 [00:05<00:24,  1.38s/it]

Processing Grade...


Evaluating features:  23%|██▎       | 5/22 [00:06<00:23,  1.39s/it]

Processing weightedScore...


Evaluating features:  27%|██▋       | 6/22 [00:08<00:22,  1.40s/it]

Processing revenue...


Evaluating features:  32%|███▏      | 7/22 [00:09<00:21,  1.40s/it]

Processing netIncome...


Evaluating features:  36%|███▋      | 8/22 [00:11<00:19,  1.40s/it]

Processing totalAssets...


Evaluating features:  41%|████      | 9/22 [00:12<00:18,  1.39s/it]

Processing eps...


Evaluating features:  45%|████▌     | 10/22 [00:13<00:16,  1.39s/it]

Processing operatingCashFlowPerShare...


Evaluating features:  50%|█████     | 11/22 [00:15<00:15,  1.39s/it]

Processing freeCashFlowPerShare...


Evaluating features:  55%|█████▍    | 12/22 [00:16<00:13,  1.38s/it]

Processing Surprise...


Evaluating features:  59%|█████▉    | 13/22 [00:18<00:12,  1.39s/it]

Processing congress_net_trade...


Evaluating features:  64%|██████▎   | 14/22 [00:19<00:11,  1.39s/it]

Processing Sentiment...


Evaluating features:  68%|██████▊   | 15/22 [00:20<00:09,  1.40s/it]

Processing stocktwitsSentiment...


Evaluating features:  73%|███████▎  | 16/22 [00:22<00:08,  1.39s/it]

Processing twitterSentiment...


Evaluating features:  77%|███████▋  | 17/22 [00:23<00:07,  1.40s/it]

Processing news_sentimentScore...


Evaluating features:  82%|████████▏ | 18/22 [00:25<00:05,  1.40s/it]

Processing Dividend_y...


Evaluating features:  86%|████████▋ | 19/22 [00:26<00:04,  1.42s/it]

Processing returnOnEquity...


Evaluating features:  91%|█████████ | 20/22 [00:27<00:02,  1.41s/it]

Processing ebitda...


Evaluating features:  95%|█████████▌| 21/22 [00:29<00:01,  1.41s/it]

Processing roic...


Evaluating features: 100%|██████████| 22/22 [00:30<00:00,  1.40s/it]


Removed dividendsPaid, Best RMSE: 34.73731918510917


Evaluating features:   0%|          | 0/21 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   5%|▍         | 1/21 [00:01<00:26,  1.32s/it]

Processing ivmean30...


Evaluating features:  10%|▉         | 2/21 [00:02<00:25,  1.32s/it]

Processing ivput270...


Evaluating features:  14%|█▍        | 3/21 [00:03<00:23,  1.32s/it]

Processing Grade...


Evaluating features:  19%|█▉        | 4/21 [00:05<00:23,  1.36s/it]

Processing weightedScore...


Evaluating features:  24%|██▍       | 5/21 [00:06<00:21,  1.37s/it]

Processing revenue...


Evaluating features:  29%|██▊       | 6/21 [00:08<00:20,  1.38s/it]

Processing netIncome...


Evaluating features:  33%|███▎      | 7/21 [00:09<00:19,  1.38s/it]

Processing totalAssets...


Evaluating features:  38%|███▊      | 8/21 [00:10<00:17,  1.37s/it]

Processing eps...


Evaluating features:  43%|████▎     | 9/21 [00:12<00:16,  1.37s/it]

Processing operatingCashFlowPerShare...


Evaluating features:  48%|████▊     | 10/21 [00:13<00:15,  1.37s/it]

Processing freeCashFlowPerShare...


Evaluating features:  52%|█████▏    | 11/21 [00:14<00:13,  1.36s/it]

Processing Surprise...


Evaluating features:  57%|█████▋    | 12/21 [00:16<00:12,  1.37s/it]

Processing congress_net_trade...


Evaluating features:  62%|██████▏   | 13/21 [00:17<00:10,  1.37s/it]

Processing Sentiment...


Evaluating features:  67%|██████▋   | 14/21 [00:19<00:09,  1.39s/it]

Processing stocktwitsSentiment...


Evaluating features:  71%|███████▏  | 15/21 [00:20<00:08,  1.38s/it]

Processing twitterSentiment...


Evaluating features:  76%|███████▌  | 16/21 [00:21<00:06,  1.38s/it]

Processing news_sentimentScore...


Evaluating features:  81%|████████  | 17/21 [00:23<00:05,  1.38s/it]

Processing Dividend_y...


Evaluating features:  86%|████████▌ | 18/21 [00:24<00:04,  1.39s/it]

Processing returnOnEquity...


Evaluating features:  90%|█████████ | 19/21 [00:26<00:02,  1.39s/it]

Processing ebitda...


Evaluating features:  95%|█████████▌| 20/21 [00:27<00:01,  1.40s/it]

Processing roic...


Evaluating features: 100%|██████████| 21/21 [00:28<00:00,  1.38s/it]


Removed operatingCashFlowPerShare, Best RMSE: 34.0870186097859


Evaluating features:   0%|          | 0/20 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   5%|▌         | 1/20 [00:01<00:24,  1.28s/it]

Processing ivmean30...


Evaluating features:  10%|█         | 2/20 [00:02<00:22,  1.27s/it]

Processing ivput270...


Evaluating features:  15%|█▌        | 3/20 [00:03<00:21,  1.28s/it]

Processing Grade...


Evaluating features:  20%|██        | 4/20 [00:05<00:20,  1.30s/it]

Processing weightedScore...


Evaluating features:  25%|██▌       | 5/20 [00:06<00:19,  1.31s/it]

Processing revenue...


Evaluating features:  30%|███       | 6/20 [00:07<00:18,  1.32s/it]

Processing netIncome...


Evaluating features:  35%|███▌      | 7/20 [00:09<00:17,  1.33s/it]

Processing totalAssets...


Evaluating features:  40%|████      | 8/20 [00:10<00:15,  1.32s/it]

Processing eps...


Evaluating features:  45%|████▌     | 9/20 [00:11<00:14,  1.33s/it]

Processing freeCashFlowPerShare...


Evaluating features:  50%|█████     | 10/20 [00:13<00:13,  1.32s/it]

Processing Surprise...


Evaluating features:  55%|█████▌    | 11/20 [00:14<00:11,  1.32s/it]

Processing congress_net_trade...


Evaluating features:  60%|██████    | 12/20 [00:15<00:10,  1.32s/it]

Processing Sentiment...


Evaluating features:  65%|██████▌   | 13/20 [00:17<00:09,  1.33s/it]

Processing stocktwitsSentiment...


Evaluating features:  70%|███████   | 14/20 [00:18<00:07,  1.32s/it]

Processing twitterSentiment...


Evaluating features:  75%|███████▌  | 15/20 [00:19<00:06,  1.32s/it]

Processing news_sentimentScore...


Evaluating features:  80%|████████  | 16/20 [00:21<00:05,  1.33s/it]

Processing Dividend_y...


Evaluating features:  85%|████████▌ | 17/20 [00:22<00:03,  1.33s/it]

Processing returnOnEquity...


Evaluating features:  90%|█████████ | 18/20 [00:23<00:02,  1.33s/it]

Processing ebitda...


Evaluating features:  95%|█████████▌| 19/20 [00:25<00:01,  1.33s/it]

Processing roic...


Evaluating features: 100%|██████████| 20/20 [00:26<00:00,  1.32s/it]


Removed totalAssets, Best RMSE: 33.46828890726362


Evaluating features:   0%|          | 0/19 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   5%|▌         | 1/19 [00:01<00:21,  1.20s/it]

Processing ivmean30...


Evaluating features:  11%|█         | 2/19 [00:02<00:20,  1.22s/it]

Processing ivput270...


Evaluating features:  16%|█▌        | 3/19 [00:03<00:19,  1.22s/it]

Processing Grade...


Evaluating features:  21%|██        | 4/19 [00:04<00:18,  1.23s/it]

Processing weightedScore...


Evaluating features:  26%|██▋       | 5/19 [00:06<00:17,  1.24s/it]

Processing revenue...


Evaluating features:  32%|███▏      | 6/19 [00:07<00:16,  1.24s/it]

Processing netIncome...


Evaluating features:  37%|███▋      | 7/19 [00:08<00:15,  1.26s/it]

Processing eps...


Evaluating features:  42%|████▏     | 8/19 [00:09<00:13,  1.27s/it]

Processing freeCashFlowPerShare...


Evaluating features:  47%|████▋     | 9/19 [00:11<00:12,  1.25s/it]

Processing Surprise...


Evaluating features:  53%|█████▎    | 10/19 [00:12<00:11,  1.26s/it]

Processing congress_net_trade...


Evaluating features:  58%|█████▊    | 11/19 [00:13<00:10,  1.26s/it]

Processing Sentiment...


Evaluating features:  63%|██████▎   | 12/19 [00:15<00:08,  1.26s/it]

Processing stocktwitsSentiment...


Evaluating features:  68%|██████▊   | 13/19 [00:16<00:07,  1.26s/it]

Processing twitterSentiment...


Evaluating features:  74%|███████▎  | 14/19 [00:17<00:06,  1.26s/it]

Processing news_sentimentScore...


Evaluating features:  79%|███████▉  | 15/19 [00:18<00:05,  1.26s/it]

Processing Dividend_y...


Evaluating features:  84%|████████▍ | 16/19 [00:20<00:03,  1.27s/it]

Processing returnOnEquity...


Evaluating features:  89%|████████▉ | 17/19 [00:21<00:02,  1.25s/it]

Processing ebitda...


Evaluating features:  95%|█████████▍| 18/19 [00:22<00:01,  1.25s/it]

Processing roic...


Evaluating features: 100%|██████████| 19/19 [00:23<00:00,  1.25s/it]


Removed ebitda, Best RMSE: 33.003652675849416


Evaluating features:   0%|          | 0/18 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   6%|▌         | 1/18 [00:01<00:20,  1.19s/it]

Processing ivmean30...


Evaluating features:  11%|█         | 2/18 [00:02<00:18,  1.18s/it]

Processing ivput270...


Evaluating features:  17%|█▋        | 3/18 [00:03<00:17,  1.18s/it]

Processing Grade...


Evaluating features:  22%|██▏       | 4/18 [00:04<00:17,  1.22s/it]

Processing weightedScore...


Evaluating features:  28%|██▊       | 5/18 [00:06<00:15,  1.23s/it]

Processing revenue...


Evaluating features:  33%|███▎      | 6/18 [00:07<00:14,  1.24s/it]

Processing netIncome...


Evaluating features:  39%|███▉      | 7/18 [00:08<00:13,  1.25s/it]

Processing eps...


Evaluating features:  44%|████▍     | 8/18 [00:09<00:12,  1.26s/it]

Processing freeCashFlowPerShare...


Evaluating features:  50%|█████     | 9/18 [00:11<00:11,  1.24s/it]

Processing Surprise...


Evaluating features:  56%|█████▌    | 10/18 [00:12<00:09,  1.24s/it]

Processing congress_net_trade...


Evaluating features:  61%|██████    | 11/18 [00:13<00:08,  1.24s/it]

Processing Sentiment...


Evaluating features:  67%|██████▋   | 12/18 [00:14<00:07,  1.25s/it]

Processing stocktwitsSentiment...


Evaluating features:  72%|███████▏  | 13/18 [00:16<00:06,  1.24s/it]

Processing twitterSentiment...


Evaluating features:  78%|███████▊  | 14/18 [00:17<00:04,  1.24s/it]

Processing news_sentimentScore...


Evaluating features:  83%|████████▎ | 15/18 [00:18<00:03,  1.24s/it]

Processing Dividend_y...


Evaluating features:  89%|████████▉ | 16/18 [00:19<00:02,  1.26s/it]

Processing returnOnEquity...


Evaluating features:  94%|█████████▍| 17/18 [00:21<00:01,  1.24s/it]

Processing roic...


Evaluating features: 100%|██████████| 18/18 [00:22<00:00,  1.24s/it]


Removed ivmean30, Best RMSE: 32.8833451296375


Evaluating features:   0%|          | 0/17 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   6%|▌         | 1/17 [00:01<00:17,  1.11s/it]

Processing ivput270...


Evaluating features:  12%|█▏        | 2/17 [00:02<00:16,  1.11s/it]

Processing Grade...


Evaluating features:  18%|█▊        | 3/17 [00:03<00:15,  1.13s/it]

Processing weightedScore...


Evaluating features:  24%|██▎       | 4/17 [00:04<00:14,  1.15s/it]

Processing revenue...


Evaluating features:  29%|██▉       | 5/17 [00:05<00:13,  1.15s/it]

Processing netIncome...


Evaluating features:  35%|███▌      | 6/17 [00:06<00:12,  1.17s/it]

Processing eps...


Evaluating features:  41%|████      | 7/17 [00:08<00:11,  1.19s/it]

Processing freeCashFlowPerShare...


Evaluating features:  47%|████▋     | 8/17 [00:09<00:10,  1.16s/it]

Processing Surprise...


Evaluating features:  53%|█████▎    | 9/17 [00:10<00:09,  1.17s/it]

Processing congress_net_trade...


Evaluating features:  59%|█████▉    | 10/17 [00:11<00:08,  1.16s/it]

Processing Sentiment...


Evaluating features:  65%|██████▍   | 11/17 [00:12<00:07,  1.18s/it]

Processing stocktwitsSentiment...


Evaluating features:  71%|███████   | 12/17 [00:13<00:05,  1.18s/it]

Processing twitterSentiment...


Evaluating features:  76%|███████▋  | 13/17 [00:15<00:04,  1.19s/it]

Processing news_sentimentScore...


Evaluating features:  82%|████████▏ | 14/17 [00:16<00:03,  1.19s/it]

Processing Dividend_y...


Evaluating features:  88%|████████▊ | 15/17 [00:17<00:02,  1.20s/it]

Processing returnOnEquity...


Evaluating features:  94%|█████████▍| 16/17 [00:18<00:01,  1.18s/it]

Processing roic...


Evaluating features: 100%|██████████| 17/17 [00:19<00:00,  1.17s/it]


Removed returnOnEquity, Best RMSE: 32.80830975494912


Evaluating features:   0%|          | 0/16 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   6%|▋         | 1/16 [00:01<00:15,  1.04s/it]

Processing ivput270...


Evaluating features:  12%|█▎        | 2/16 [00:02<00:14,  1.05s/it]

Processing Grade...


Evaluating features:  19%|█▉        | 3/16 [00:03<00:14,  1.08s/it]

Processing weightedScore...


Evaluating features:  25%|██▌       | 4/16 [00:04<00:13,  1.09s/it]

Processing revenue...


Evaluating features:  31%|███▏      | 5/16 [00:05<00:12,  1.12s/it]

Processing netIncome...


Evaluating features:  38%|███▊      | 6/16 [00:06<00:11,  1.12s/it]

Processing eps...


Evaluating features:  44%|████▍     | 7/16 [00:07<00:10,  1.13s/it]

Processing freeCashFlowPerShare...


Evaluating features:  50%|█████     | 8/16 [00:08<00:08,  1.10s/it]

Processing Surprise...


Evaluating features:  56%|█████▋    | 9/16 [00:09<00:07,  1.11s/it]

Processing congress_net_trade...


Evaluating features:  62%|██████▎   | 10/16 [00:11<00:06,  1.13s/it]

Processing Sentiment...


Evaluating features:  69%|██████▉   | 11/16 [00:12<00:05,  1.13s/it]

Processing stocktwitsSentiment...


Evaluating features:  75%|███████▌  | 12/16 [00:13<00:04,  1.11s/it]

Processing twitterSentiment...


Evaluating features:  81%|████████▏ | 13/16 [00:14<00:03,  1.11s/it]

Processing news_sentimentScore...


Evaluating features:  88%|████████▊ | 14/16 [00:15<00:02,  1.11s/it]

Processing Dividend_y...


Evaluating features:  94%|█████████▍| 15/16 [00:16<00:01,  1.11s/it]

Processing roic...


Evaluating features: 100%|██████████| 16/16 [00:17<00:00,  1.11s/it]


Removed Surprise, Best RMSE: 32.459646301235985


Evaluating features:   0%|          | 0/15 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   7%|▋         | 1/15 [00:01<00:14,  1.02s/it]

Processing ivput270...


Evaluating features:  13%|█▎        | 2/15 [00:02<00:13,  1.02s/it]

Processing Grade...


Evaluating features:  20%|██        | 3/15 [00:03<00:12,  1.04s/it]

Processing weightedScore...


Evaluating features:  27%|██▋       | 4/15 [00:04<00:11,  1.07s/it]

Processing revenue...


Evaluating features:  33%|███▎      | 5/15 [00:05<00:10,  1.08s/it]

Processing netIncome...


Evaluating features:  40%|████      | 6/15 [00:06<00:09,  1.08s/it]

Processing eps...


Evaluating features:  47%|████▋     | 7/15 [00:07<00:08,  1.09s/it]

Processing freeCashFlowPerShare...


Evaluating features:  53%|█████▎    | 8/15 [00:08<00:07,  1.06s/it]

Processing congress_net_trade...


Evaluating features:  60%|██████    | 9/15 [00:09<00:06,  1.06s/it]

Processing Sentiment...


Evaluating features:  67%|██████▋   | 10/15 [00:10<00:05,  1.08s/it]

Processing stocktwitsSentiment...


Evaluating features:  73%|███████▎  | 11/15 [00:11<00:04,  1.08s/it]

Processing twitterSentiment...


Evaluating features:  80%|████████  | 12/15 [00:12<00:03,  1.09s/it]

Processing news_sentimentScore...


Evaluating features:  87%|████████▋ | 13/15 [00:13<00:02,  1.09s/it]

Processing Dividend_y...


Evaluating features:  93%|█████████▎| 14/15 [00:15<00:01,  1.09s/it]

Processing roic...


Evaluating features: 100%|██████████| 15/15 [00:16<00:00,  1.07s/it]


No further improvement.
Selected features: ['^VIX', 'ivput270', 'Grade', 'weightedScore', 'revenue', 'netIncome', 'eps', 'freeCashFlowPerShare', 'congress_net_trade', 'Sentiment', 'stocktwitsSentiment', 'twitterSentiment', 'news_sentimentScore', 'Dividend_y', 'roic']
Final RMSE on the hold-out test set: 32.459646301235985
Next day prediction: 400.4585003051758




In [None]:
# Collect the backward-elimination outcome (kept features, RMSE and next-day
# prediction) in one table, save it as CSV and trigger a Colab download.
results_selected_be_features_rmse_and_predictions = results_df = pd.DataFrame(
    dict(
        Best_Features=best_features,
        RMSE=best_rmse,
        Next_Day_Prediction=one_day_pred,
    )
)
results_df.to_csv('results_selected_be_features_rmse_and_predictions.csv', index=True)
files.download('results_selected_be_features_rmse_and_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the backward-elimination results from the CSV written by the export
# cell. read_csv already returns a DataFrame, so no re-wrapping is needed.
# The file was saved with index=True, so the former index comes back as an
# extra unnamed first column.
results_selected_be_features_rmse_and_predictions = pd.read_csv(
    'results_selected_be_features_rmse_and_predictions.csv'
)

# LSTM Model

## LSTM Model with Past Prices only

In [None]:
def lstm_forecast_with_tuning_log(data_series_base, n_steps=5, n_features=1):
  """
  LSTM Model for Past Prices only.

  Turns the price series into log returns, frames them as sliding windows of
  `n_steps` returns that predict the following return, tunes an LSTM with a
  Keras Tuner random search on each of 4 TimeSeriesSplit folds, and uses the
  model tuned on the last fold to forecast the next two prices.

  Args:
    data_series_base: pandas Series of prices. Assumed to be ordered oldest to
      newest, because the previous row is used as the prior price and the last
      row as the latest price.
    n_steps: number of consecutive log returns in each input window.
    n_features: size of the last axis the windows are reshaped to.

  Returns:
    One-row DataFrame indexed by `data_series_base.name` with columns
    'Average RMSE' and 'Average Adjusted R-Squared' (fold averages measured on
    log returns, not on prices) and '1 Day Forecast' / '2 Day Forecast'
    (forecast log returns converted back to price levels).

  Raises:
    ValueError: if `data_series_base` is not a pandas Series, or is too short
      to produce more windows than cross-validation splits.
  """
  if not isinstance(data_series_base, pd.Series):
    raise ValueError("data_series_base must be a pandas Series.")

  n_splits = 4

  log_returns = np.log(data_series_base / data_series_base.shift(1)).dropna()
  data = log_returns.values

  # TimeSeriesSplit needs more samples than splits. Fail early with a clear
  # message instead of an IndexError when reshaping an empty window array.
  n_windows = len(data) - n_steps
  if n_windows <= n_splits:
    raise ValueError(
        f"data_series_base is too short: {len(data)} log returns yield "
        f"{max(n_windows, 0)} windows of {n_steps} steps, but more than "
        f"{n_splits} windows are required for cross-validation."
    )

  def preprocess_data(data, n_steps):
    # Each sample is n_steps consecutive returns; its label is the next return.
    X, y = [], []
    for i in range(len(data) - n_steps):
      end_ix = i + n_steps
      X.append(data[i:end_ix])
      y.append(data[end_ix])
    return np.array(X), np.array(y)

  def build_model(hp):
    # Search space: one input LSTM layer, `n_layers` further LSTM layers (the
    # range is currently fixed at 1), dropout, and a single output unit.
    model = Sequential()
    model.add(LSTM(units=hp.Int('input_unit', min_value=32, max_value=256, step=32),
                    return_sequences=True, input_shape=(n_steps, n_features)))
    n_layers = hp.Int('n_layers', 1, 1)
    for i in range(n_layers):
      model.add(LSTM(units=hp.Int(f'lstm_{i}_units', min_value=32, max_value=256, step=32),
                      return_sequences=(i < n_layers - 1)))
    model.add(Dropout(hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)))
    # Linear output: the target is a log return and can be negative, which a
    # 'relu' or 'sigmoid' output can never produce.
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mse'])
    return model

  X, y = preprocess_data(data, n_steps)
  X = X.reshape((X.shape[0], X.shape[1], n_features))

  tscv = TimeSeriesSplit(n_splits=n_splits)
  metrics_list = []

  for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Timestamped directory per fold; presumably so the tuner starts a fresh
    # search instead of reloading trials saved by an earlier one.
    LOG_DIR = f"{int(time.time())}"
    # NOTE(review): the objective is the *training* 'mse' even though
    # validation data is supplied. Switching to 'val_mse' with this fold's test
    # data would leak it into model selection; consider carving an inner
    # validation split out of the training fold.
    tuner = RandomSearch(build_model,
                          objective=Objective("mse", direction="min"),
                          max_trials=5,
                          executions_per_trial=1,
                          directory=LOG_DIR)

    tuner.search(x=X_train, y=y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=0)

    best_model = tuner.get_best_models(num_models=1)[0]

    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r_squared = r2_score(y_test, y_pred)
    n = len(y_test)
    # NOTE(review): k=1 counts the whole window as one predictor — confirm
    # this is the intended degrees-of-freedom correction.
    k = 1
    adjusted_r_squared = 1 - ((1 - r_squared) * (n - 1) / (n - k - 1))
    metrics_list.append((rmse, adjusted_r_squared))

  avg_rmse = np.mean([m[0] for m in metrics_list])
  avg_adjusted_r_squared = np.mean([m[1] for m in metrics_list])

  # Two-step recursive forecast with the model from the last fold: predict the
  # next return, slide the window forward by that prediction, predict again.
  forecast_input = data[-n_steps:].reshape((1, n_steps, n_features))
  forecast = [best_model.predict(forecast_input)[0][0]]
  forecast_input = np.append(forecast_input.flatten()[1:], forecast).reshape((1, n_steps, n_features))
  forecast.append(best_model.predict(forecast_input)[0][0])

  # Positional access: `series[-1]` is label-based on integer indexes and its
  # positional fallback is deprecated on other index types.
  last_price = data_series_base.iloc[-1]

  # Forecasted log returns
  log_return_day1 = forecast[0]
  log_return_day2 = forecast[1]

  # Convert log returns to actual prices
  forecast_price_day1 = last_price * np.exp(log_return_day1)
  forecast_price_day2 = forecast_price_day1 * np.exp(log_return_day2)

  results_df = pd.DataFrame({
      'Average RMSE': [avg_rmse],
      'Average Adjusted R-Squared': [avg_adjusted_r_squared],
      '1 Day Forecast': forecast_price_day1,
      '2 Day Forecast': forecast_price_day2
  })

  results_df.index = [data_series_base.name]

  return results_df


In [None]:
# Tune, evaluate and forecast with the past-prices LSTM on the 'Adj Close' column of msft_df.
lstm_past_prices_results = lstm_forecast_with_tuning_log(msft_df['Adj Close'])











In [None]:
# Display the past-prices LSTM results table.
lstm_past_prices_results

Unnamed: 0,Average RMSE,Average Adjusted R-Squared,1 Day Forecast,2 Day Forecast
Adj Close,0.016611,-0.011936,402.701401,402.752624


In [None]:
# One-off export, kept for provenance; uncomment to regenerate the saved CSV:
# lstm_past_prices_results.to_csv('results_lstm_past_prices_results.csv', index=True)
# files.download('results_lstm_past_prices_results.csv')

# Reload the saved results (presumably so the tuning cell need not be re-run).
# read_csv already returns a DataFrame, so no re-wrapping is needed. Because
# the export used index=True, the former index comes back as an extra unnamed
# first column.
lstm_past_prices_results = pd.read_csv('results_lstm_past_prices_results.csv')

## LSTM Model with All Features

In [None]:
# Model inputs for the all-features LSTM: every master_df column except the
# two price columns.
columns_except = [
    column_name
    for column_name in master_df.columns
    if column_name not in ('Adj Close', 'Shifted Adj Close')
]

In [None]:
# Run lstm_model_with_tuning (defined outside this section) on master_df;
# 'Shifted Adj Close' is presumably the target and columns_except the inputs.
lstm_all_features_results = lstm_model_with_tuning(master_df, 'Shifted Adj Close', columns_except)

Reloading Tuner from my_dir/f/tuner0.json




Reloading Tuner from my_dir/f/tuner0.json
Reloading Tuner from my_dir/f/tuner0.json




Reloading Tuner from my_dir/f/tuner0.json






In [None]:
# Display the all-features LSTM results table.
lstm_all_features_results

Unnamed: 0,Average RMSE,"Forecast March 5, 2024","Forecast March 6, 2024"
0,76.522759,261.475311,261.263824


In [None]:
# One-off export, kept for provenance; uncomment to regenerate the saved CSV:
# lstm_all_features_results.to_csv('results_lstm_all_features.csv', index=True)
# files.download('results_lstm_all_features.csv')

# Reload the saved results (presumably so the tuning cell need not be re-run).
# read_csv already returns a DataFrame, so no re-wrapping is needed. Because
# the export used index=True, the former index comes back as an extra unnamed
# first column.
lstm_all_features_results = pd.read_csv('results_lstm_all_features.csv')

## LSTM Model with Selected Features

In [None]:
# Model inputs for the selected-features LSTM: every df column other than
# 'Shifted Adj Close', which is passed separately to lstm_model_with_tuning.
# NOTE(review): unlike the all-features cell, 'Adj Close' is not excluded here.
# Confirm whether df contains that column and whether the asymmetry is intended.
columns_except = [
    column_name
    for column_name in df.columns
    if column_name != 'Shifted Adj Close'
]
lstm_selected_features_results = lstm_model_with_tuning(
    df, 'Shifted Adj Close', columns_except
)

Trial 15 Complete [00h 00m 10s]
mse: 0.0276452898979187

Best mse So Far: 0.006463322788476944
Total elapsed time: 00h 02m 26s
Reloading Tuner from my_dir/g/tuner0.json
Reloading Tuner from my_dir/g/tuner0.json




Reloading Tuner from my_dir/g/tuner0.json






In [None]:
# Display the selected-features LSTM results table.
lstm_selected_features_results

Unnamed: 0,Average RMSE,"Forecast March 5, 2024","Forecast March 6, 2024"
0,35.884046,309.767822,309.995453


In [None]:
# One-off export, kept for provenance; uncomment to regenerate the saved CSV:
# lstm_selected_features_results.to_csv('results_lstm_selected_features.csv', index=True)
# files.download('results_lstm_selected_features.csv')

# Reload the saved results (presumably so the tuning cell need not be re-run).
# read_csv already returns a DataFrame, so no re-wrapping is needed. Because
# the export used index=True, the former index comes back as an extra unnamed
# first column.
lstm_selected_features_results = pd.read_csv('results_lstm_selected_features.csv')

# Results Summary

In [None]:
# One row per Random Forest variant: a display label paired with the test RMSE
# read from that variant's results frame. The "iteratively trained" frames are
# read at their last row, all others at their first.
rf_results_summary_dict = [
    {'Random Forest Model': model_label, 'Test RMSE': test_rmse}
    for model_label, test_rmse in (
        ('All Features: Basic Model',
         rmse_df_all_basic['RMSE Scores'].values[0]),
        ('Selected Features: Basic Model',
         rmse_df_selected_basic['RMSE Scores'].values[0]),
        ('All Features: Iteratively Trained on Top Features',
         rmse_df_all_train_top_features['RMSE_Scores'].values[-1]),
        ('Selected Features: Iteratively Trained on Top Features',
         rmse_df_selected_train_top_features['RMSE_Scores'].values[-1]),
        ('All Features: Grid Search CV',
         results_all_gs_rmse['RMSE Scores'].values[0]),
        ('Selected Features: Grid Search CV',
         results_selected_gs_rmse_scores['RMSE Scores'].values[0]),
        ('All Features: Forward Selection',
         results_all_fs_features_rmse_and_predictions['RMSE'].values[0]),
        ('Selected Features: Forward Selection',
         results_selected_fs_features_rmse_and_predictions['RMSE'].values[0]),
        ('All Features: Backward Elimination',
         results_all_be_features_rmse_and_predictions['RMSE'].values[0]),
        ('Selected Features: Backward Elimination',
         results_selected_be_features_rmse_and_predictions['RMSE'].values[0]),
    )
]

# Build the table and round the RMSE column to three decimals for display.
rf_results_summary_df = pd.DataFrame(rf_results_summary_dict).assign(
    **{'Test RMSE': lambda frame: frame['Test RMSE'].round(3)}
)
rf_results_summary_df

Unnamed: 0,Random Forest Model,Test RMSE
0,All Features: Basic Model,35.335
1,Selected Features: Basic Model,43.164
2,All Features: Iteratively Trained on Top Features,31.994
3,Selected Features: Iteratively Trained on Top ...,45.869
4,All Features: Grid Search CV,35.236
5,Selected Features: Grid Search CV,31.213
6,All Features: Forward Selection,113.434
7,Selected Features: Forward Selection,39.57
8,All Features: Backward Elimination,34.147
9,Selected Features: Backward Elimination,32.46


From our various Random Forest models, the model with the Grid Search Cross Validation on Selected Features has the lowest RMSE on test data. The feature importance data from this model is presented below:

In [None]:
# Display the feature importance table of the selected-features Grid Search CV model.
results_selected_gs_features

Unnamed: 0,Feature,Score
0,totalAssets,0.129405
1,operatingCashFlowPerShare,0.126714
2,netIncome,0.09654
3,ebitda,0.093344
4,eps,0.092403
5,revenue,0.080148
6,freeCashFlowPerShare,0.070664
7,roic,0.046475
8,dividendsPaid,0.04511
9,^VIX,0.042469


In [None]:
# Split the importance table into its first ten rows and everything after them
# (assumes results_selected_gs_features is sorted by Score, descending — TODO confirm).
top_10_features = results_selected_gs_features.head(10)
remaining_features = results_selected_gs_features.iloc[10:]

# Sum the scores of the remaining features
sum_of_remaining = remaining_features['Score'].sum()

# Create a new row for the sum of remaining features and label it 'Other'
other_row = pd.DataFrame(data={'Feature': ['Other'], 'Score': [sum_of_remaining]})

# Append this row to the top 10 features DataFrame
final_df = pd.concat([top_10_features, other_row], ignore_index=True)

# Share of the pie taken by 'Other', computed from the plotted scores so the
# annotation stays correct if the importances change (previously hard-coded).
total_score = final_df['Score'].sum()
other_share = sum_of_remaining / total_score if total_score else 0.0

fig = px.pie(final_df, values='Score', names='Feature',
             title='Top 10 Features and Scores',
             color_discrete_sequence=px.colors.qualitative.Pastel1)


fig.update_traces(textposition='outside', textinfo='label+percent',
                  hoverinfo='label+percent',
                  insidetextorientation='radial')


fig.update_layout(
    uniformtext_minsize=10,
    uniformtext_mode='hide',
    showlegend=False,
    autosize=False,
    width=1000,
    height=600,
    template='plotly_white'
)

# Create a list of features in "Other" and format it as a single string
# (names are stringified so non-string labels cannot break the join).
other_features_list = remaining_features['Feature'].tolist()
other_features_text = (
    f"Other Features ({other_share:.1%}):<br>"
    + "<br>".join(map(str, other_features_list))
)

# Add the annotation with the list of "Other" features on the right side of the chart
fig.add_annotation(
    text=other_features_text,
    align='left',
    showarrow=False,
    xref='paper',
    yref='paper',
    x=1.3,
    y=1.2
)

fig.show()

This output suggests that the model recognizes the company's size (Total Assets) and the efficiency of its operations (Operating Cash Flow Per Share) as the most significant factors affecting the following day's stock price. These features reflect the company's financial health. The next few features, Net Income, EBITDA and EPS, are all key indicators of the company's earnings and profitability and are critical to investors. The output of this model aligns with domain knowledge and financial theory regarding the most significant factors affecting a company's stock behavior. Further analysis and interpretation will be discussed in the project report.

Below is an examination of this Random Forest model's predictive ability compared to a simple baseline model, an LSTM model with past prices, an LSTM model with all features and an LSTM model with the selected features. The actual price of MSFT on March 5th, 2024 was 402.65.

In [None]:
# Actual MSFT price on March 5th, 2024, used as the single validation point.
ACTUAL_PRICE_MAR_5_2024 = 402.65


def _single_point_rmse(predicted_price, actual_price=ACTUAL_PRICE_MAR_5_2024):
  """Return the RMSE of one prediction against one actual price.

  With a single observation this equals the absolute error.
  """
  return np.sqrt(mean_squared_error([actual_price], [predicted_price]))


# One row per model: test RMSE, predicted price and the error against the
# actual price. The baseline row takes both values from simple_prediction.
data = [
    {
        'Model': 'Simple Baseline Model',
        'Test RMSE': average_rmse_over_timeframe,
        'Predicted Price': simple_prediction[0],
        'Validation RMSE': simple_prediction[1]
    },
    {
        'Model': 'LSTM Log of Past Prices',
        'Test RMSE': lstm_past_prices_results['Average RMSE'].values[0],
        'Predicted Price': lstm_past_prices_results['1 Day Forecast'].values[0],
        'Validation RMSE': _single_point_rmse(lstm_past_prices_results['1 Day Forecast'].values[0])
    },
    {
        'Model': 'LSTM All Features',
        'Test RMSE': lstm_all_features_results['Average RMSE'].values[0],
        'Predicted Price': lstm_all_features_results['Forecast March 5, 2024'].values[0],
        'Validation RMSE': _single_point_rmse(lstm_all_features_results['Forecast March 5, 2024'].values[0])
    },
    {
        'Model': 'LSTM Selected Features',
        'Test RMSE': lstm_selected_features_results['Average RMSE'].values[0],
        'Predicted Price': lstm_selected_features_results['Forecast March 5, 2024'].values[0],
        'Validation RMSE': _single_point_rmse(lstm_selected_features_results['Forecast March 5, 2024'].values[0])
    },
    {
        'Model': 'Selected Features: Grid Search CV',
        'Test RMSE': results_selected_gs_rmse_scores['RMSE Scores'].values[0],
        'Predicted Price': results_selected_gs_prediction['Prediction'].values[0],
        'Validation RMSE': _single_point_rmse(results_selected_gs_prediction['Prediction'].values[0])
    }
]

# Round the numeric columns to three decimals for display.
results_summary_df = pd.DataFrame(data)
results_summary_df['Test RMSE'] = results_summary_df['Test RMSE'].round(3)
results_summary_df['Predicted Price'] = results_summary_df['Predicted Price'].round(3)
results_summary_df['Validation RMSE'] = results_summary_df['Validation RMSE'].round(3)
results_summary_df

Unnamed: 0,Model,Test RMSE,Predicted Price,Validation RMSE
0,Simple Baseline Model,3.659,415.394,12.744
1,LSTM Log of Past Prices,0.017,402.701,0.051
2,LSTM All Features,76.523,261.475,141.175
3,LSTM Selected Features,35.884,309.768,92.882
4,Selected Features: Grid Search CV,31.213,401.352,1.298


In [None]:
# Order the models from highest to lowest predicted price.
sorted_df = results_summary_df.sort_values('Predicted Price', ascending=False)

# One bar per model, labelled above the bar with its predicted price (2 decimals).
fig = go.Figure(
    data=[
        go.Bar(
            x=sorted_df['Model'],
            y=sorted_df['Predicted Price'],
            name='Predicted Price',
            marker_color='#FFCB05',
            text=sorted_df['Predicted Price'],
            textposition='outside',
            texttemplate='%{text:.2f}',
        )
    ]
)

# Horizontal reference line at the actual price.
fig.add_hline(
    y=402.65,
    line_color="#00274C",
    annotation_text="Actual Price $402.65",
    annotation_position="top right",
    annotation_font_color="#00274C",
)

# Titles, tick angle, theme and a fixed y-axis window of 200 to 500.
fig.update_layout(
    title='1 Day Predicted Price by Model',
    xaxis_title='Model',
    yaxis_title='Predicted Price ($)',
    xaxis_tickangle=-20,
    template='plotly_white',
    yaxis_range=[200, 500],
)

fig.show()

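The LSTM Model with Past Prices appears the most accurate in both testing and validation predictions. Note, however, that its test RMSE is computed on log returns rather than on prices, so it is not directly comparable to the other models' test RMSE values, and that the validation RMSE for every model is based on a single observation (the March 5th, 2024 price). The Random Forest Grid Search Model with Selected Features also shows reasonable performance on unseen data despite an elevated test RMSE. The remaining models do not appear to generalize well to unseen data, indicating poor predictive ability.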