This is a continuation from the Data Processing notebook that starts with the Master DataFrame containing the combined dataset for this project.

# Imports and Installs

In [1]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m966.2 kB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [13]:
# Admin
import requests
from google.colab import files
from datetime import datetime
from tqdm import tqdm
from tqdm.notebook import tqdm
import time

import ast
import pandas as pd
import numpy as np
import yfinance as yf

from sklearn.model_selection import TimeSeriesSplit
from statsmodels.api import OLS, GLS, add_constant
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, make_scorer, mean_squared_error
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from scipy.stats import pearsonr
import scipy.stats as stats

# LSTM
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
import math
from sklearn.model_selection import train_test_split
import kerastuner
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from keras_tuner.tuners import RandomSearch
from kerastuner import HyperModel, RandomSearch
from keras_tuner import HyperParameters, Objective
import keras_tuner

# Random Forest
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Visualizations
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Functions

In [11]:
def download_stock_prices(ticker_symbol, start):
  """
  Downloads historical adjusted closing prices for a given stock ticker.

  Parameters:
  - ticker_symbol: The stock ticker symbol as a string.

  Returns:
  - prices_df: A DataFrame containing the adjusted closing prices.
  """
  start = start
  today_date = datetime.now().strftime('%Y-%m-%d')
  end = '2024-03-06'

  # Download historical data
  price_data = yf.download(ticker_symbol, start=start, end=end)

  # Extract adjusted closing prices
  adj_close_prices = price_data['Adj Close']

  # Create a DataFrame with the adjusted closing prices
  prices_df = pd.DataFrame(adj_close_prices)

  return prices_df

In [5]:
class LSTMHyperModel(HyperModel):
  def __init__(self, input_shape):
    self.input_shape = input_shape

  def build(self, hp):
    model = Sequential()
    num_lstm_layers = hp.Int('num_lstm_layers', min_value=1, max_value=6, step=1)

    for i in range(num_lstm_layers):
      if i == 0:
        # First layer needs to specify input shape
        model.add(LSTM(units=hp.Int(f'lstm_units_{i}', min_value=32, max_value=512, step=32),
                      input_shape=self.input_shape,
                      return_sequences=(i != num_lstm_layers - 1)))
      else:
        # Subsequent layers
        model.add(LSTM(units=hp.Int(f'lstm_units_{i}', min_value=32, max_value=512, step=32),
                      return_sequences=(i != num_lstm_layers - 1)))

      model.add(Dropout(rate=hp.Float(f'dropout_{i}', min_value=0.0, max_value=0.5, step=0.05)))

    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
    return model

In [6]:
def lstm_model_with_tuning(df, target_column, feature_columns):
  if isinstance(feature_columns, str):
    feature_columns = [feature_columns]

  X = df[feature_columns].values
  y = df[target_column].values.reshape(-1, 1)
  num_features = len(feature_columns)
  input_shape = (1, num_features)

  tscv = TimeSeriesSplit(n_splits=4)
  rmse_scores = []

  for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_train_scaled = scaler_X.fit_transform(X_train).reshape(-1, 1, num_features)
    X_test_scaled = scaler_X.transform(X_test).reshape(-1, 1, num_features)
    y_train_scaled = scaler_y.fit_transform(y_train)
    y_test_scaled = scaler_y.transform(y_test)

    model_builder = LSTMHyperModel(input_shape=input_shape)
    tuner = RandomSearch(model_builder,
                          objective='mse',
                          max_trials=15,
                          executions_per_trial=1,
                          directory='my_dir',
                          project_name='my_project')

    tuner.search(X_train_scaled, y_train_scaled, epochs=10, batch_size=64, validation_data=(X_test_scaled, y_test_scaled))
    best_model = tuner.get_best_models(num_models=1)[0]

    predictions_scaled = best_model.predict(X_test_scaled)
    predictions = scaler_y.inverse_transform(predictions_scaled)
    rmse = math.sqrt(mean_squared_error(y_test, predictions))
    rmse_scores.append(rmse)

  # Forecasting
  forecasts = []
  for i in range(1, 3):
    last_input = df[feature_columns].values[-i].reshape(1, -1)
    last_input_scaled = scaler_X.transform(last_input).reshape(-1, 1, num_features)
    forecast_scaled = best_model.predict(last_input_scaled)
    forecast = scaler_y.inverse_transform(forecast_scaled)
    forecasts.append(forecast[0][0])

  results_df = pd.DataFrame({
      'Average RMSE': [np.mean(rmse_scores)],
      'Forecast March 5, 2024': forecasts[1],
      'Forecast March 6, 2024': forecasts[0]
  })

  return results_df

# Download Data

## Stock Price Data

Data obtained from Yahoo! Finance using yfinance package

In [14]:
msft_df = download_stock_prices('MSFT', '2020-01-01')
msft_df.head()

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2020-01-02,154.49382
2020-01-03,152.570114
2020-01-06,152.964462
2020-01-07,151.569778
2020-01-08,153.984009


## Master DataFrame

In [10]:
master_df = pd.read_csv('master_df.csv')
master_df = pd.DataFrame(master_df)
master_df['Date'] = pd.to_datetime(master_df['Date'])
master_df = master_df.set_index('Date')

master_df['Shifted Adj Close'] = master_df['Adj Close'].shift(-1)

last_features_row = master_df.drop(columns=['Adj Close','Shifted Adj Close']).iloc[-1].values.reshape(1, -1)

master_df = master_df.dropna()
master_df.head()

Unnamed: 0_level_0,Adj Close,^VIX9D,^VIX,^VIX3M,^VIX6M,hv10,hv20,hv30,hv60,hv90,...,averageInventory,daysSalesOutstanding,daysPayablesOutstanding,daysOfInventoryOnHand,roe,capexPerShare,redditSentiment,AV Sentiment Score,finbert_news_score,Shifted Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,154.49382,10.6,12.47,15.19,16.58,0.1334,0.1156,0.1043,0.0988,0.1347,...,2214613000.0,57.384287,64.356106,13.279729,0.105603,-0.465652,0.0,0.0,0.0,152.570114
2020-01-03,152.570114,12.78,14.02,16.01,17.16,0.1627,0.1315,0.117,0.1056,0.1363,...,2206726000.0,57.399854,64.544062,13.283037,0.105412,-0.466142,0.0,0.0,0.0,152.964462
2020-01-06,152.964462,12.42,13.85,15.92,17.08,0.1627,0.1287,0.117,0.1053,0.1349,...,2198839000.0,57.415421,64.732018,13.286346,0.10522,-0.466631,0.0,0.0,0.0,151.569794
2020-01-07,151.569794,12.34,13.79,15.86,17.02,0.166,0.1352,0.124,0.1084,0.1343,...,2190952000.0,57.430987,64.919974,13.289654,0.105028,-0.467121,0.0,0.0,0.0,153.98407
2020-01-08,153.98407,12.22,13.45,15.4,16.65,0.1947,0.1455,0.1283,0.1135,0.133,...,2183065000.0,57.446554,65.107929,13.292963,0.104836,-0.467611,0.0,0.0,0.0,155.907791


# Simple Baseline Model

According to financial theory, stock prices follow a random walk, meaning that the stock price for tomorrow equals today's price plus a random error term. This following model attempts to replicate that model and will be used as a baseline model against which to compare the other models in this project.

In [16]:
def simple_price_with_drift(start_price, end_price, iterations=1000, drift=0.001):
  drift = np.mean(msft_df['Adj Close'].pct_change().dropna()) # historic average daily return for MSFT
  new_prices = []
  rmses = []

  for _ in range(iterations):

    error_term = np.random.randn()
    new_price = start_price + start_price * drift + error_term

    # Calculate and store new price and RMSE
    rmse = np.sqrt(mean_squared_error([end_price], [new_price]))
    new_prices.append(new_price)
    rmses.append(rmse)

  # Calculate the average new price and average RMSE across all iterations
  average_new_price = round(np.mean(new_prices), 4)
  average_rmse = round(np.mean(rmses), 4)

  return average_new_price, average_rmse

In [17]:
rmse_full_timeframe = []
predicted_simple_price_full_timeframe = []
drift = np.mean(msft_df['Adj Close'].pct_change().dropna())
# Calculate for msft_df prices
for i in tqdm(range(1, len(msft_df))):
  start_price = msft_df['Adj Close'].iloc[i-1]
  end_price = msft_df['Adj Close'].iloc[i]
  new_price, average_rmse = simple_price_with_drift(start_price, end_price, 1000, drift)
  rmse_full_timeframe.append(average_rmse)
  predicted_simple_price_full_timeframe.append(new_price)

# Calculate the average RMSE across the entire timeframe
average_rmse_over_timeframe = np.mean(rmse_full_timeframe)

  0%|          | 0/1049 [00:00<?, ?it/s]

In [18]:
average_rmse_over_timeframe

3.6568136320305054

In [19]:
# simple price forecast for last day
simple_prediction = simple_price_with_drift(414.92, 402.65,1000)

In [20]:
msft_df_dropped_first = msft_df.iloc[1:].copy()
simple_model_df = msft_df_dropped_first
simple_model_df['Predicted Price'] = predicted_simple_price_full_timeframe

simple_model_df = simple_model_df.set_index(msft_df_dropped_first.index)

# Display the resulting DataFrame
simple_model_df.head()

Unnamed: 0_level_0,Adj Close,Predicted Price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-03,152.570114,154.6753
2020-01-06,152.964462,152.7764
2020-01-07,151.569778,153.1412
2020-01-08,153.984009,151.741
2020-01-09,155.907761,154.1665


In [24]:
simple_model_df_last_250 = simple_model_df.iloc[-250:] #250 trading days in 1 year

# Create a line plot for 'Adj Close'
fig = go.Figure()
fig.add_trace(go.Scatter(x=simple_model_df_last_250.index, y=simple_model_df_last_250['Adj Close'], mode='lines',
                         name='Actual Price', line=dict(color='#FFCB05')))

# Add a line plot for 'Predicted Price'
fig.add_trace(go.Scatter(x=simple_model_df_last_250.index, y=simple_model_df_last_250['Predicted Price'], mode='lines',
                         name='Predicted Price', line=dict(color='#00274C')))

# Update the layout
fig.update_layout(
    title='MSFT Actual Price vs Next Day Simple Predicted Price',
    xaxis_title='Date',
    yaxis_title='Price ($)',
    template='plotly_white'
)

fig.show()

# Random Forest Model (All Features)

## Basic Random Forest Model

In [None]:
X = master_df.drop(columns=['Adj Close','Shifted Adj Close'])
y = master_df['Shifted Adj Close']

# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Placeholder list to store RMSE for each fold
rmse_scores = []

# Placeholder for feature importances
feature_importances = None

for train_index, test_index in tqdm(tscv.split(X), total=tscv.n_splits):
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  model = RandomForestRegressor(n_estimators=10000, random_state=42)
  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  # Compute and store RMSE for the current fold
  rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

  # Update feature importances - using the last split here for simplicity
  feature_importances = pd.Series(model.feature_importances_, index=X.columns)

# Average RMSE across folds
average_rmse = np.mean(rmse_scores)
print(f"Average RMSE: {average_rmse}")

100%|██████████| 5/5 [35:53<00:00, 430.64s/it]

Average RMSE: 35.334841778900966





In [None]:
sorted_feature_importances = feature_importances.sort_values(ascending=False)
top_ten_features = sorted_feature_importances[0:10]
top_ten_features

epsdiluted                           0.141138
netIncome                            0.140621
eps                                  0.135894
enterpriseValue                      0.087658
cashAtBeginningOfPeriod              0.043145
dividendPaidAndCapexCoverageRatio    0.011736
investedCapital                      0.009860
totalStockholdersEquity              0.009509
longTermDebtToCapitalization         0.009426
totalAssets                          0.009287
dtype: float64

In [None]:
next_day_prediction = model.predict(last_features_row)

print(f"Next day predicted 'Adj Close': {next_day_prediction[0]}")



Next day predicted 'Adj Close': 328.03809423217774


In [None]:
# save output

# RMSE score
rmse_df_all_basic = pd.DataFrame({'RMSE Scores': [average_rmse]})
rmse_df_all_basic.to_csv('results_all_basic_rmse_scores.csv', index=False)
files.download('results_all_basic_rmse_scores.csv')

# Feature importances
sorted_feature_importances.to_csv('results_all_basic_sorted_feature_importances.csv', index=True)
files.download('results_all_basic_sorted_feature_importances.csv')

# Price prediction
predictions_df = pd.DataFrame(next_day_prediction, columns=['Prediction'])
predictions_df.to_csv('results_all_basic_random_forest_prediction.csv', index=True)
files.download('results_all_basic_random_forest_prediction.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
rmse_df_all_basic = pd.read_csv('results_all_basic_rmse_scores.csv')
rmse_df_all_basic = pd.DataFrame(rmse_df_all_basic)

## Train Model with Top Features Obtained from Basic Model

In [None]:
def train_with_top_features_and_predict(X, y, top_features):
  """
  Trains models with an increasing number of top features and calculates RMSE for each model.

  Parameters:
  - X: DataFrame containing all features.
  - y: Series containing the target variable.
  - top_features: Series with sorted feature importances.

  Returns:
  - List of RMSE scores for models trained with incremental top features.
  """
  rmse_scores_by_feature_count = []
  final_model = None
  next_day_prediction = None

  # Initialize TimeSeriesSplit
  tscv = TimeSeriesSplit(n_splits=5)

  for i in tqdm(range(1, len(top_features) + 1)):
    # Select the top i features
    top_i_features = top_features.index[:i]
    X_top_i = X[top_i_features]

    rmse_scores = []  # Store RMSE for each fold

    for train_index, test_index in tscv.split(X_top_i):
      X_train, X_test = X_top_i.iloc[train_index], X_top_i.iloc[test_index]
      y_train, y_test = y.iloc[train_index], y.iloc[test_index]

      # Train the model
      model = RandomForestRegressor(n_estimators=7000, random_state=42)
      model.fit(X_train, y_train)

      y_pred = model.predict(X_test)

      # Compute and store RMSE for the current fold
      rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    # Average RMSE across folds for the current number of top features
    average_rmse = np.mean(rmse_scores)
    rmse_scores_by_feature_count.append(average_rmse)

    # Keep the last model and its features for next day prediction
    if i == len(top_features):
      final_model = model
      final_features = top_i_features

  # Train the final model on the latest data using selected top features
  X_train_final = X[final_features]
  y_train_final = y
  final_model.fit(X_train_final, y_train_final)

  # Prepare the last row from the dataset for prediction
  last_features_row = X[final_features].iloc[-1:].values
  next_day_prediction = final_model.predict(last_features_row)

  return rmse_scores_by_feature_count, next_day_prediction[0], final_model

In [None]:
top_ten_features = pd.read_csv('results_all_basic_sorted_feature_importances.csv')
top_ten_features = pd.DataFrame(top_ten_features).set_index('Unnamed: 0')
top_ten_features = top_ten_features.iloc[:10]

In [None]:
df = master_df
X = df.drop(columns=['Adj Close', 'Shifted Adj Close'])
y = df['Shifted Adj Close']

rmse_scores_list, next_day_pred, model = train_with_top_features_and_predict(X, y, top_ten_features)

print("RMSE scores by number of features used:", rmse_scores_list)
print("Predicted price for the next day:", next_day_pred)

In [None]:
num_top_features = list(range(1, len(top_ten_features) + 1))

rmse_df_all_train_top_features = pd.DataFrame({
    'Num_Top_Features': num_top_features,
    'RMSE_Scores': rmse_scores_list
})

# Add the next_day_pred as a new column (repeating the value for all rows for demonstration)
rmse_df_all_train_top_features['Next_Day_Prediction based on final model'] = next_day_pred

rmse_df_all_train_top_features.to_csv('results_all_train_top_features_rmse_and_prediction.csv', index=False)
files.download('results_all_train_top_features_rmse_and_prediction.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# the following file has the test RMSE and prediction for the model iteratively trained on top features from the basic model
rmse_df_all_train_top_features = pd.read_csv('results_all_train_top_features_rmse_and_prediction.csv')
rmse_df_all_train_top_features = pd.DataFrame(rmse_df_all_train_top_features)

## Random Forest Model with Grid Search CV to get Top Features

In [None]:
# Model with GridSearchCV
X = master_df.drop(columns=['Adj Close','Shifted Adj Close'])
y = master_df['Shifted Adj Close']

# Define the parameter grid to search
param_grid = {
    'n_estimators': [1500, 3000, 6000],  # List of numbers of trees
    'min_samples_split': [2, 12, 20], # Minimum number of samples required to split a node
}

model = RandomForestRegressor(random_state=42)

tscv = TimeSeriesSplit(n_splits=5)

rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=tscv, scoring=rmse_scorer, verbose=1, n_jobs=-1)

grid_search.fit(X, y)

best_model = grid_search.best_estimator_

feature_importances = pd.Series(best_model.feature_importances_, index=X.columns)

# Sort feature importances in descending order and select top 10
sorted_feature_importances = feature_importances.sort_values(ascending=False)
top_ten_features = sorted_feature_importances[:10]

last_features_row = X.iloc[-1].values.reshape(1, -1)

next_day_prediction = best_model.predict(last_features_row)

print("Best parameters:", grid_search.best_params_)
print("Best RMSE score:", -grid_search.best_score_)
print(f"Top 10 features:\n{top_ten_features}")
print("Next day prediction:", next_day_prediction[0])

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters: {'min_samples_split': 12, 'n_estimators': 3000}
Best RMSE score: 35.23601112757803
Top 10 features:
enterpriseValue                       0.646652
^VIX6M                                0.046765
^VIX3M                                0.023085
capitalExpenditureCoverageRatio       0.015604
netReceivables                        0.008085
freeCashFlowOperatingCashFlowRatio    0.006726
capexToOperatingCashFlow              0.006628
hv150                                 0.005411
hv120                                 0.005251
totalEquity                           0.005165
dtype: float64
Next day prediction: 405.9340169863772




In [None]:
# RMSE score
rmse_scores = -grid_search.best_score_
rmse_df = pd.DataFrame({'RMSE Scores': [rmse_scores]})
rmse_df.to_csv('rmse_scores.csv', index=False)
files.download('rmse_scores.csv')

# Feature importances
sorted_feature_importances.to_csv('sorted_feature_importances.csv', index=True)
files.download('sorted_feature_importances.csv')

# Price prediction
predictions_df = pd.DataFrame(next_day_prediction, columns=['Prediction'])
predictions_df.to_csv('random_forest_prediction.csv', index=True)
files.download('random_forest_prediction.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# the following file has the test RMSE and prediction for the RF model with Grid Search CV using all features
results_all_gs_rmse = pd.read_csv('results_all_gs_random_forest_prediction_and_rmse_scores.csv')
results_all_gs_rmse = pd.DataFrame(results_all_gs_rmse)

## Random Forest with Forward Selection

In [None]:
X = master_df.drop(columns=['Adj Close','Shifted Adj Close'])
y = master_df['Shifted Adj Close']

# Split the data into training and testing sets
tscv = TimeSeriesSplit(n_splits=5)

# List to keep track of selected features and performance
selected_features = []
remaining_features = list(X_train.columns)
best_score = float('inf')

while remaining_features and len(selected_features) < 10:
  score_improvement = False
  for feature in remaining_features:
    # Test adding the current feature
    trial_features = selected_features + [feature]
    trial_X = X[trial_features]

    # Cross-validate the model with the current set of features
    rmse_scores = []
    for train_index, test_index in tscv.split(trial_X):
      X_train, X_test = trial_X.iloc[train_index], trial_X.iloc[test_index]
      y_train, y_test = y.iloc[train_index], y.iloc[test_index]

      model = RandomForestRegressor(n_estimators=10000, random_state=42)
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
      rmse_score = np.sqrt(mean_squared_error(y_test, y_pred))
      rmse_scores.append(rmse_score)

    # Calculate the average RMSE across time-series splits
    average_rmse = np.mean(rmse_scores)

    # Check if the RMSE has improved
    if average_rmse < best_score:
      best_score = average_rmse
      best_feature = feature
      score_improvement = True

  # If the feature improves the model, add it to the selected features
  if score_improvement:
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)
  else:
    # If no improvement, exit the while loop
    break

print("Selected features:", selected_features)
print("Best RMSE Score:", best_score)

# Train the final model on the selected features
# Use the last split as the hold-out test set
train_index, test_index = list(tscv.split(X[selected_features]))[-1]
X_train, X_test = X[selected_features].iloc[train_index], X[selected_features].iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

final_model = RandomForestRegressor(n_estimators=100, random_state=42)
final_model.fit(X_train, y_train)
final_predictions = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))

# Predict the next day (using the last available features)
last_features_row = X[selected_features].iloc[-1].values.reshape(1, -1)
next_day_prediction = final_model.predict(last_features_row)

print("Final RMSE on the hold-out test set:", final_rmse)
print("Next day prediction:", next_day_prediction[0])

Selected features: ['hv60']
Best RMSE Score: 80.74162455339832
Final RMSE on the hold-out test set: 113.43419127620191
Next day prediction: 284.4680347442627




In [None]:
results_df = pd.DataFrame({
    'Selected_Features': selected_features,
    'RMSE': final_rmse,
    'Next_Day_Prediction': next_day_prediction[0]
})
results_all_fs_features_rmse_and_predictions = results_df
results_all_fs_features_rmse_and_predictions.to_csv('results_all_fs_features_rmse_and_predictions.csv', index=False)
files.download('results_all_fs_features_rmse_and_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
results_all_fs_features_rmse_and_predictions = pd.read_csv('results_all_fs_features_rmse_and_predictions.csv')
results_all_fs_features_rmse_and_predictions = pd.DataFrame(results_all_fs_features_rmse_and_predictions)

## Random Forest with Backward Elimination

In [None]:
def rf_backward_elimination(X, y):
  features = X.columns.tolist()
  best_rmse = float('inf')
  best_features = features.copy()
  improvement = True

  # Initialize TimeSeriesSplit
  tscv = TimeSeriesSplit(n_splits=5)

  while improvement and features:
    improvement = False
    feature_rmse = []

    for feature in tqdm(features, desc="Evaluating features"):
      # Indicate which feature is being processed
      print(f"Processing {feature}...")
      trial_features = [f for f in features if f != feature]
      trial_X = X[trial_features]

      # Store RMSE for each fold
      fold_rmse = []

      for train_index, test_index in tscv.split(trial_X):
        X_train, X_test = trial_X.iloc[train_index], trial_X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        fold_rmse.append(rmse)

      # Calculate average RMSE across folds for current feature set
      average_rmse = np.mean(fold_rmse)
      feature_rmse.append((feature, average_rmse))

    # Find the feature whose removal gives the best average RMSE
    feature_rmse.sort(key=lambda x: x[1])
    if feature_rmse and feature_rmse[0][1] < best_rmse:
      best_rmse = feature_rmse[0][1]
      features.remove(feature_rmse[0][0])
      best_features = features.copy()
      improvement = True
      print(f"Removed {feature_rmse[0][0]}, Best RMSE: {best_rmse}")
    else:
      print("No further improvement.")
  final_X = X[best_features]
  final_model = RandomForestRegressor(n_estimators=1000, random_state=42)
  final_model.fit(final_X, y)

  # Make a one-day prediction using the last available data point
  last_features_row = final_X.iloc[-1:].values
  next_day_prediction = final_model.predict(last_features_row)

  return best_features, best_rmse, next_day_prediction[0]

In [None]:
results_df = pd.DataFrame({
    'Best_Features': best_features,
    'RMSE': best_rmse,
    'Next_Day_Prediction': one_day_pred
})
results_all_be_features_rmse_and_predictions = results_df
results_all_be_features_rmse_and_predictions.to_csv('results_all_be_features_rmse_and_predictions.csv', index=True)
files.download('results_all_be_features_rmse_and_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
results_all_be_features_rmse_and_predictions = pd.read_csv('results_all_be_features_rmse_and_predictions.csv')
results_all_be_features_rmse_and_predictions = pd.DataFrame(results_all_be_features_rmse_and_predictions)

# Random Forest Model Discretionary Selected Features

## Basic Random Forest Model

In [None]:
df = master_df[['Shifted Adj Close','dividendsPaid', '^VIX', 'ivmean30', 'ivput270', 'Grade', 'weightedScore', 'revenue', 'netIncome',
              'totalAssets', 'eps', 'operatingCashFlowPerShare', 'freeCashFlowPerShare','Surprise', 'congress_net_trade',
               'Sentiment', 'stocktwitsSentiment', 'twitterSentiment', 'news_sentimentScore', 'Dividend_y',
                'netProfitMargin', 'returnOnEquity', 'ebitda', 'roic']]
df = df[253:]
last_features_row = df.drop(columns=['Shifted Adj Close']).iloc[-1].values.reshape(1, -1)

In [None]:
X = df.drop(columns=['Shifted Adj Close'])
y = df['Shifted Adj Close']
tscv = TimeSeriesSplit(n_splits=5)

rmse_scores = []

feature_importances = None

for train_index, test_index in tqdm(tscv.split(X), total=tscv.n_splits):
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  model = RandomForestRegressor(n_estimators=10000, random_state=42)
  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

  feature_importances = pd.Series(model.feature_importances_, index=X.columns)

average_rmse = np.mean(rmse_scores)
print(f"Average RMSE: {average_rmse}")


100%|██████████| 5/5 [02:33<00:00, 30.79s/it]

Average RMSE: 43.1642011637954





In [None]:
sorted_feature_importances = feature_importances.sort_values(ascending=False)
top_eight_features = sorted_feature_importances[0:8]
top_eight_features

eps                 0.220288
netIncome           0.218502
twitterSentiment    0.215219
netProfitMargin     0.106305
totalAssets         0.071295
returnOnEquity      0.031568
ebitda              0.023007
revenue             0.022562
dtype: float64

In [None]:
next_day_prediction = model.predict((last_features_row))

print(f"Next day predicted 'Adj Close': {next_day_prediction[0]}")



Next day predicted 'Adj Close': 331.25040352478027


In [None]:
rmse_df_selected_basic = pd.DataFrame({'RMSE Scores': [average_rmse]})
rmse_df_selected_basic.to_csv('results_selected_basic_rmse_scores.csv', index=False)
files.download('results_selected_basic_rmse_scores.csv')

# Feature importances
sorted_feature_importances.to_csv('results_selected_basic_sorted_feature_importances.csv', index=True)
files.download('results_selected_basic_sorted_feature_importances.csv')

# Price prediction
predictions_df = pd.DataFrame(next_day_prediction, columns=['Prediction'])
predictions_df.to_csv('results_selected_basic_random_forest_prediction.csv', index=True)
files.download('results_selected_basic_random_forest_prediction.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
rmse_df_selected_basic = pd.read_csv('results_selected_basic_rmse_scores.csv')
rmse_df_selected_basic = pd.DataFrame(rmse_df_selected_basic)

## Train with Top Features

In [None]:
selected_train_top_features_rmse_scores_list = train_with_top_features_and_predict(X, y, top_eight_features)

100%|██████████| 8/8 [06:43<00:00, 50.46s/it]


In [None]:
print("RMSE scores for models with incremental top features:", selected_train_top_features_rmse_scores_list)

RMSE scores for models with incremental top features: ([38.43422975359275, 38.26421569345608, 34.40223457698411, 37.47378480587942, 43.51139570894516, 43.836223845872134, 45.743598893183304, 45.86943292878883], 400.7212232870456, RandomForestRegressor(n_estimators=7000, random_state=42))


In [None]:
num_top_features = list(range(1, len(top_eight_features) + 1))

# Create a DataFrame for RMSE scores
rmse_df_selected_train_top_features = pd.DataFrame({
    'Num_Top_Features': num_top_features,
    'RMSE_Scores': selected_train_top_features_rmse_scores_list[0]
})

# Add the next_day_pred as a new column (repeating the value for all rows for demonstration)
rmse_df_selected_train_top_features['Next_Day_Prediction based on final model'] = next_day_pred

rmse_df_selected_train_top_features.to_csv('results_selected_train_top_features_rmse_and_prediction.csv', index=False)
files.download('results_selected_train_top_features_rmse_and_prediction.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
rmse_df_selected_train_top_features = pd.read_csv('results_selected_train_top_features_rmse_and_prediction.csv')
rmse_df_selected_train_top_features = pd.DataFrame(rmse_df_selected_train_top_features)

## Random Forest With Grid Search CV to get Top Features

In [None]:
# Model with GridSearchCV
X = df.drop(columns=['Shifted Adj Close'])
y = df['Shifted Adj Close']

# Define the parameter grid to search
param_grid = {
    'n_estimators': [1500, 3000, 6000],  # List of numbers of trees
    'max_depth': [None, 30, 120],  # Maximum depth of trees, including 'None' for full growth
    'min_samples_split': [2, 12, 20], # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 3, 8],    # Minimum number of samples required at each leaf node
    'max_features': ['auto', 'sqrt', 40]  # Number of features to consider at every split
}

# Initialize the model
model = RandomForestRegressor(random_state=42)

tscv = TimeSeriesSplit(n_splits=5)

# Define RMSE as the scoring criteria
rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

# GridSearchCV with TimeSeriesSplit cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=tscv, scoring=rmse_scorer, verbose=1, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X, y)

# Access the best estimator
best_model = grid_search.best_estimator_

# Feature importances from the best model
feature_importances = pd.Series(best_model.feature_importances_, index=X.columns)

# Sort feature importances in descending order and select top 10
sorted_feature_importances = feature_importances.sort_values(ascending=False)
top_ten_features = sorted_feature_importances[:10]

last_features_row = X.iloc[-1].values.reshape(1, -1)

next_day_prediction = best_model.predict(last_features_row)

# Print the results
print("Best parameters:", grid_search.best_params_)
print("Best RMSE score:", -grid_search.best_score_)
print(f"Top 10 features:\n{top_ten_features}")
print("Next day prediction:", next_day_prediction[0])

In [None]:
# RMSE score
rmse_scores = -grid_search.best_score_
rmse_df = pd.DataFrame({'RMSE Scores': [rmse_scores]})
rmse_df.to_csv('results_selected_gs_rmse_scores.csv', index=False)
files.download('results_selected_gs_rmse_scores.csv')

# Feature importances
sorted_feature_importances.to_csv('results_selected_gs_sorted_feature_importances.csv', index=True)
files.download('results_selected_gs_sorted_feature_importances.csv')

# Price prediction
predictions_df = pd.DataFrame(next_day_prediction, columns=['Prediction'])
predictions_df.to_csv('results_selected_gs_prediction.csv', index=True)
files.download('results_selected_gs_prediction.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
results_selected_gs_rmse_scores = pd.read_csv('results_selected_gs_rmse_scores.csv')
results_selected_gs_rmse_scores = pd.DataFrame(results_selected_gs_rmse_scores)

In [None]:
results_selected_gs_features = pd.read_csv('results_selected_gs_sorted_feature_importances.csv')
results_selected_gs_features = pd.DataFrame(results_selected_gs_features)

In [None]:
results_selected_gs_prediction = pd.read_csv('results_selected_gs_prediction.csv')
results_selected_gs_prediction = pd.DataFrame(results_selected_gs_prediction)

## Random Forest with Forward Selection

In [None]:
X = df.drop(columns=['Shifted Adj Close'])
y = df['Shifted Adj Close']

# Split the data into training and testing sets
tscv = TimeSeriesSplit(n_splits=5)

# List to keep track of selected features and performance
selected_features = []
remaining_features = list(X_train.columns)
best_score = float('inf')

while remaining_features and len(selected_features) < 8:
  score_improvement = False
  for feature in remaining_features:
    # Test adding the current feature
    trial_features = selected_features + [feature]
    trial_X = X[trial_features]

    # Cross-validate the model with the current set of features
    rmse_scores = []
    for train_index, test_index in tscv.split(trial_X):
      X_train, X_test = trial_X.iloc[train_index], trial_X.iloc[test_index]
      y_train, y_test = y.iloc[train_index], y.iloc[test_index]

      model = RandomForestRegressor(n_estimators=1000, random_state=42)
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
      rmse_score = np.sqrt(mean_squared_error(y_test, y_pred))
      rmse_scores.append(rmse_score)

    # Calculate the average RMSE across time-series splits
    average_rmse = np.mean(rmse_scores)

    # Check if we've improved the RMSE
    if average_rmse < best_score:
      best_score = average_rmse
      best_feature = feature
      score_improvement = True

  # If a feature improves the model, add it to the selected features
  if score_improvement:
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)
  else:
    # If no improvement, exit the while loop
    break

print("Selected features:", selected_features)
print("Best RMSE Score:", best_score)

# Train the final model on the selected features
# Use the last split as the hold-out test set
train_index, test_index = list(tscv.split(X[selected_features]))[-1]
X_train, X_test = X[selected_features].iloc[train_index], X[selected_features].iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

final_model = RandomForestRegressor(n_estimators=100, random_state=42)
final_model.fit(X_train, y_train)
final_predictions = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))

# Predict the next day (using the last available features)
last_features_row = X[selected_features].iloc[-1].values.reshape(1, -1)
next_day_prediction = final_model.predict(last_features_row)

print("Final RMSE on the hold-out test set:", final_rmse)
print("Next day prediction:", next_day_prediction[0])

In [None]:
results_df = pd.DataFrame({
    'Selected_Features': selected_features,
    'RMSE': final_rmse,
    'Next_Day_Prediction': next_day_prediction[0]
})
results_selected_fs_features_rmse_and_predictions = results_df
results_selected_fs_features_rmse_and_predictions.to_csv('results_selected_fs_features_rmse_and_predictions.csv', index=True)
files.download('results_selected_fs_features_rmse_and_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
results_selected_fs_features_rmse_and_predictions = pd.read_csv('results_selected_fs_features_rmse_and_predictions.csv')
results_selected_fs_features_rmse_and_predictions = pd.DataFrame(results_selected_fs_features_rmse_and_predictions)

## Random Forest with Backward Elimination

In [None]:
X = df.drop(columns=['Shifted Adj Close'])
y = df['Shifted Adj Close']
best_features, best_rmse, one_day_pred = rf_backward_elimination(X, y)

print("Selected features:", best_features)
print("Final RMSE on the hold-out test set:", best_rmse)
print("Next day prediction:", one_day_pred)


Evaluating features:   0%|          | 0/23 [00:00<?, ?it/s]

Processing dividendsPaid...


Evaluating features:   4%|▍         | 1/23 [00:01<00:36,  1.68s/it]

Processing ^VIX...


Evaluating features:   9%|▊         | 2/23 [00:03<00:33,  1.59s/it]

Processing ivmean30...


Evaluating features:  13%|█▎        | 3/23 [00:04<00:30,  1.55s/it]

Processing ivput270...


Evaluating features:  17%|█▋        | 4/23 [00:06<00:28,  1.51s/it]

Processing Grade...


Evaluating features:  22%|██▏       | 5/23 [00:07<00:27,  1.52s/it]

Processing weightedScore...


Evaluating features:  26%|██▌       | 6/23 [00:09<00:25,  1.52s/it]

Processing revenue...


Evaluating features:  30%|███       | 7/23 [00:10<00:24,  1.51s/it]

Processing netIncome...


Evaluating features:  35%|███▍      | 8/23 [00:12<00:22,  1.51s/it]

Processing totalAssets...


Evaluating features:  39%|███▉      | 9/23 [00:13<00:20,  1.49s/it]

Processing eps...


Evaluating features:  43%|████▎     | 10/23 [00:15<00:19,  1.50s/it]

Processing operatingCashFlowPerShare...


Evaluating features:  48%|████▊     | 11/23 [00:16<00:17,  1.49s/it]

Processing freeCashFlowPerShare...


Evaluating features:  52%|█████▏    | 12/23 [00:18<00:16,  1.47s/it]

Processing Surprise...


Evaluating features:  57%|█████▋    | 13/23 [00:19<00:14,  1.48s/it]

Processing congress_net_trade...


Evaluating features:  61%|██████    | 14/23 [00:21<00:13,  1.49s/it]

Processing Sentiment...


Evaluating features:  65%|██████▌   | 15/23 [00:22<00:12,  1.50s/it]

Processing stocktwitsSentiment...


Evaluating features:  70%|██████▉   | 16/23 [00:24<00:10,  1.49s/it]

Processing twitterSentiment...


Evaluating features:  74%|███████▍  | 17/23 [00:25<00:08,  1.50s/it]

Processing news_sentimentScore...


Evaluating features:  78%|███████▊  | 18/23 [00:27<00:07,  1.50s/it]

Processing Dividend_y...


Evaluating features:  83%|████████▎ | 19/23 [00:28<00:06,  1.51s/it]

Processing netProfitMargin...


Evaluating features:  87%|████████▋ | 20/23 [00:30<00:04,  1.49s/it]

Processing returnOnEquity...


Evaluating features:  91%|█████████▏| 21/23 [00:31<00:02,  1.48s/it]

Processing ebitda...


Evaluating features:  96%|█████████▌| 22/23 [00:33<00:01,  1.50s/it]

Processing roic...


Evaluating features: 100%|██████████| 23/23 [00:34<00:00,  1.50s/it]


Removed netProfitMargin, Best RMSE: 37.28028085855479


Evaluating features:   0%|          | 0/22 [00:00<?, ?it/s]

Processing dividendsPaid...


Evaluating features:   5%|▍         | 1/22 [00:01<00:29,  1.42s/it]

Processing ^VIX...


Evaluating features:   9%|▉         | 2/22 [00:02<00:27,  1.40s/it]

Processing ivmean30...


Evaluating features:  14%|█▎        | 3/22 [00:04<00:26,  1.39s/it]

Processing ivput270...


Evaluating features:  18%|█▊        | 4/22 [00:05<00:24,  1.38s/it]

Processing Grade...


Evaluating features:  23%|██▎       | 5/22 [00:06<00:23,  1.39s/it]

Processing weightedScore...


Evaluating features:  27%|██▋       | 6/22 [00:08<00:22,  1.40s/it]

Processing revenue...


Evaluating features:  32%|███▏      | 7/22 [00:09<00:21,  1.40s/it]

Processing netIncome...


Evaluating features:  36%|███▋      | 8/22 [00:11<00:19,  1.40s/it]

Processing totalAssets...


Evaluating features:  41%|████      | 9/22 [00:12<00:18,  1.39s/it]

Processing eps...


Evaluating features:  45%|████▌     | 10/22 [00:13<00:16,  1.39s/it]

Processing operatingCashFlowPerShare...


Evaluating features:  50%|█████     | 11/22 [00:15<00:15,  1.39s/it]

Processing freeCashFlowPerShare...


Evaluating features:  55%|█████▍    | 12/22 [00:16<00:13,  1.38s/it]

Processing Surprise...


Evaluating features:  59%|█████▉    | 13/22 [00:18<00:12,  1.39s/it]

Processing congress_net_trade...


Evaluating features:  64%|██████▎   | 14/22 [00:19<00:11,  1.39s/it]

Processing Sentiment...


Evaluating features:  68%|██████▊   | 15/22 [00:20<00:09,  1.40s/it]

Processing stocktwitsSentiment...


Evaluating features:  73%|███████▎  | 16/22 [00:22<00:08,  1.39s/it]

Processing twitterSentiment...


Evaluating features:  77%|███████▋  | 17/22 [00:23<00:07,  1.40s/it]

Processing news_sentimentScore...


Evaluating features:  82%|████████▏ | 18/22 [00:25<00:05,  1.40s/it]

Processing Dividend_y...


Evaluating features:  86%|████████▋ | 19/22 [00:26<00:04,  1.42s/it]

Processing returnOnEquity...


Evaluating features:  91%|█████████ | 20/22 [00:27<00:02,  1.41s/it]

Processing ebitda...


Evaluating features:  95%|█████████▌| 21/22 [00:29<00:01,  1.41s/it]

Processing roic...


Evaluating features: 100%|██████████| 22/22 [00:30<00:00,  1.40s/it]


Removed dividendsPaid, Best RMSE: 34.73731918510917


Evaluating features:   0%|          | 0/21 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   5%|▍         | 1/21 [00:01<00:26,  1.32s/it]

Processing ivmean30...


Evaluating features:  10%|▉         | 2/21 [00:02<00:25,  1.32s/it]

Processing ivput270...


Evaluating features:  14%|█▍        | 3/21 [00:03<00:23,  1.32s/it]

Processing Grade...


Evaluating features:  19%|█▉        | 4/21 [00:05<00:23,  1.36s/it]

Processing weightedScore...


Evaluating features:  24%|██▍       | 5/21 [00:06<00:21,  1.37s/it]

Processing revenue...


Evaluating features:  29%|██▊       | 6/21 [00:08<00:20,  1.38s/it]

Processing netIncome...


Evaluating features:  33%|███▎      | 7/21 [00:09<00:19,  1.38s/it]

Processing totalAssets...


Evaluating features:  38%|███▊      | 8/21 [00:10<00:17,  1.37s/it]

Processing eps...


Evaluating features:  43%|████▎     | 9/21 [00:12<00:16,  1.37s/it]

Processing operatingCashFlowPerShare...


Evaluating features:  48%|████▊     | 10/21 [00:13<00:15,  1.37s/it]

Processing freeCashFlowPerShare...


Evaluating features:  52%|█████▏    | 11/21 [00:14<00:13,  1.36s/it]

Processing Surprise...


Evaluating features:  57%|█████▋    | 12/21 [00:16<00:12,  1.37s/it]

Processing congress_net_trade...


Evaluating features:  62%|██████▏   | 13/21 [00:17<00:10,  1.37s/it]

Processing Sentiment...


Evaluating features:  67%|██████▋   | 14/21 [00:19<00:09,  1.39s/it]

Processing stocktwitsSentiment...


Evaluating features:  71%|███████▏  | 15/21 [00:20<00:08,  1.38s/it]

Processing twitterSentiment...


Evaluating features:  76%|███████▌  | 16/21 [00:21<00:06,  1.38s/it]

Processing news_sentimentScore...


Evaluating features:  81%|████████  | 17/21 [00:23<00:05,  1.38s/it]

Processing Dividend_y...


Evaluating features:  86%|████████▌ | 18/21 [00:24<00:04,  1.39s/it]

Processing returnOnEquity...


Evaluating features:  90%|█████████ | 19/21 [00:26<00:02,  1.39s/it]

Processing ebitda...


Evaluating features:  95%|█████████▌| 20/21 [00:27<00:01,  1.40s/it]

Processing roic...


Evaluating features: 100%|██████████| 21/21 [00:28<00:00,  1.38s/it]


Removed operatingCashFlowPerShare, Best RMSE: 34.0870186097859


Evaluating features:   0%|          | 0/20 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   5%|▌         | 1/20 [00:01<00:24,  1.28s/it]

Processing ivmean30...


Evaluating features:  10%|█         | 2/20 [00:02<00:22,  1.27s/it]

Processing ivput270...


Evaluating features:  15%|█▌        | 3/20 [00:03<00:21,  1.28s/it]

Processing Grade...


Evaluating features:  20%|██        | 4/20 [00:05<00:20,  1.30s/it]

Processing weightedScore...


Evaluating features:  25%|██▌       | 5/20 [00:06<00:19,  1.31s/it]

Processing revenue...


Evaluating features:  30%|███       | 6/20 [00:07<00:18,  1.32s/it]

Processing netIncome...


Evaluating features:  35%|███▌      | 7/20 [00:09<00:17,  1.33s/it]

Processing totalAssets...


Evaluating features:  40%|████      | 8/20 [00:10<00:15,  1.32s/it]

Processing eps...


Evaluating features:  45%|████▌     | 9/20 [00:11<00:14,  1.33s/it]

Processing freeCashFlowPerShare...


Evaluating features:  50%|█████     | 10/20 [00:13<00:13,  1.32s/it]

Processing Surprise...


Evaluating features:  55%|█████▌    | 11/20 [00:14<00:11,  1.32s/it]

Processing congress_net_trade...


Evaluating features:  60%|██████    | 12/20 [00:15<00:10,  1.32s/it]

Processing Sentiment...


Evaluating features:  65%|██████▌   | 13/20 [00:17<00:09,  1.33s/it]

Processing stocktwitsSentiment...


Evaluating features:  70%|███████   | 14/20 [00:18<00:07,  1.32s/it]

Processing twitterSentiment...


Evaluating features:  75%|███████▌  | 15/20 [00:19<00:06,  1.32s/it]

Processing news_sentimentScore...


Evaluating features:  80%|████████  | 16/20 [00:21<00:05,  1.33s/it]

Processing Dividend_y...


Evaluating features:  85%|████████▌ | 17/20 [00:22<00:03,  1.33s/it]

Processing returnOnEquity...


Evaluating features:  90%|█████████ | 18/20 [00:23<00:02,  1.33s/it]

Processing ebitda...


Evaluating features:  95%|█████████▌| 19/20 [00:25<00:01,  1.33s/it]

Processing roic...


Evaluating features: 100%|██████████| 20/20 [00:26<00:00,  1.32s/it]


Removed totalAssets, Best RMSE: 33.46828890726362


Evaluating features:   0%|          | 0/19 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   5%|▌         | 1/19 [00:01<00:21,  1.20s/it]

Processing ivmean30...


Evaluating features:  11%|█         | 2/19 [00:02<00:20,  1.22s/it]

Processing ivput270...


Evaluating features:  16%|█▌        | 3/19 [00:03<00:19,  1.22s/it]

Processing Grade...


Evaluating features:  21%|██        | 4/19 [00:04<00:18,  1.23s/it]

Processing weightedScore...


Evaluating features:  26%|██▋       | 5/19 [00:06<00:17,  1.24s/it]

Processing revenue...


Evaluating features:  32%|███▏      | 6/19 [00:07<00:16,  1.24s/it]

Processing netIncome...


Evaluating features:  37%|███▋      | 7/19 [00:08<00:15,  1.26s/it]

Processing eps...


Evaluating features:  42%|████▏     | 8/19 [00:09<00:13,  1.27s/it]

Processing freeCashFlowPerShare...


Evaluating features:  47%|████▋     | 9/19 [00:11<00:12,  1.25s/it]

Processing Surprise...


Evaluating features:  53%|█████▎    | 10/19 [00:12<00:11,  1.26s/it]

Processing congress_net_trade...


Evaluating features:  58%|█████▊    | 11/19 [00:13<00:10,  1.26s/it]

Processing Sentiment...


Evaluating features:  63%|██████▎   | 12/19 [00:15<00:08,  1.26s/it]

Processing stocktwitsSentiment...


Evaluating features:  68%|██████▊   | 13/19 [00:16<00:07,  1.26s/it]

Processing twitterSentiment...


Evaluating features:  74%|███████▎  | 14/19 [00:17<00:06,  1.26s/it]

Processing news_sentimentScore...


Evaluating features:  79%|███████▉  | 15/19 [00:18<00:05,  1.26s/it]

Processing Dividend_y...


Evaluating features:  84%|████████▍ | 16/19 [00:20<00:03,  1.27s/it]

Processing returnOnEquity...


Evaluating features:  89%|████████▉ | 17/19 [00:21<00:02,  1.25s/it]

Processing ebitda...


Evaluating features:  95%|█████████▍| 18/19 [00:22<00:01,  1.25s/it]

Processing roic...


Evaluating features: 100%|██████████| 19/19 [00:23<00:00,  1.25s/it]


Removed ebitda, Best RMSE: 33.003652675849416


Evaluating features:   0%|          | 0/18 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   6%|▌         | 1/18 [00:01<00:20,  1.19s/it]

Processing ivmean30...


Evaluating features:  11%|█         | 2/18 [00:02<00:18,  1.18s/it]

Processing ivput270...


Evaluating features:  17%|█▋        | 3/18 [00:03<00:17,  1.18s/it]

Processing Grade...


Evaluating features:  22%|██▏       | 4/18 [00:04<00:17,  1.22s/it]

Processing weightedScore...


Evaluating features:  28%|██▊       | 5/18 [00:06<00:15,  1.23s/it]

Processing revenue...


Evaluating features:  33%|███▎      | 6/18 [00:07<00:14,  1.24s/it]

Processing netIncome...


Evaluating features:  39%|███▉      | 7/18 [00:08<00:13,  1.25s/it]

Processing eps...


Evaluating features:  44%|████▍     | 8/18 [00:09<00:12,  1.26s/it]

Processing freeCashFlowPerShare...


Evaluating features:  50%|█████     | 9/18 [00:11<00:11,  1.24s/it]

Processing Surprise...


Evaluating features:  56%|█████▌    | 10/18 [00:12<00:09,  1.24s/it]

Processing congress_net_trade...


Evaluating features:  61%|██████    | 11/18 [00:13<00:08,  1.24s/it]

Processing Sentiment...


Evaluating features:  67%|██████▋   | 12/18 [00:14<00:07,  1.25s/it]

Processing stocktwitsSentiment...


Evaluating features:  72%|███████▏  | 13/18 [00:16<00:06,  1.24s/it]

Processing twitterSentiment...


Evaluating features:  78%|███████▊  | 14/18 [00:17<00:04,  1.24s/it]

Processing news_sentimentScore...


Evaluating features:  83%|████████▎ | 15/18 [00:18<00:03,  1.24s/it]

Processing Dividend_y...


Evaluating features:  89%|████████▉ | 16/18 [00:19<00:02,  1.26s/it]

Processing returnOnEquity...


Evaluating features:  94%|█████████▍| 17/18 [00:21<00:01,  1.24s/it]

Processing roic...


Evaluating features: 100%|██████████| 18/18 [00:22<00:00,  1.24s/it]


Removed ivmean30, Best RMSE: 32.8833451296375


Evaluating features:   0%|          | 0/17 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   6%|▌         | 1/17 [00:01<00:17,  1.11s/it]

Processing ivput270...


Evaluating features:  12%|█▏        | 2/17 [00:02<00:16,  1.11s/it]

Processing Grade...


Evaluating features:  18%|█▊        | 3/17 [00:03<00:15,  1.13s/it]

Processing weightedScore...


Evaluating features:  24%|██▎       | 4/17 [00:04<00:14,  1.15s/it]

Processing revenue...


Evaluating features:  29%|██▉       | 5/17 [00:05<00:13,  1.15s/it]

Processing netIncome...


Evaluating features:  35%|███▌      | 6/17 [00:06<00:12,  1.17s/it]

Processing eps...


Evaluating features:  41%|████      | 7/17 [00:08<00:11,  1.19s/it]

Processing freeCashFlowPerShare...


Evaluating features:  47%|████▋     | 8/17 [00:09<00:10,  1.16s/it]

Processing Surprise...


Evaluating features:  53%|█████▎    | 9/17 [00:10<00:09,  1.17s/it]

Processing congress_net_trade...


Evaluating features:  59%|█████▉    | 10/17 [00:11<00:08,  1.16s/it]

Processing Sentiment...


Evaluating features:  65%|██████▍   | 11/17 [00:12<00:07,  1.18s/it]

Processing stocktwitsSentiment...


Evaluating features:  71%|███████   | 12/17 [00:13<00:05,  1.18s/it]

Processing twitterSentiment...


Evaluating features:  76%|███████▋  | 13/17 [00:15<00:04,  1.19s/it]

Processing news_sentimentScore...


Evaluating features:  82%|████████▏ | 14/17 [00:16<00:03,  1.19s/it]

Processing Dividend_y...


Evaluating features:  88%|████████▊ | 15/17 [00:17<00:02,  1.20s/it]

Processing returnOnEquity...


Evaluating features:  94%|█████████▍| 16/17 [00:18<00:01,  1.18s/it]

Processing roic...


Evaluating features: 100%|██████████| 17/17 [00:19<00:00,  1.17s/it]


Removed returnOnEquity, Best RMSE: 32.80830975494912


Evaluating features:   0%|          | 0/16 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   6%|▋         | 1/16 [00:01<00:15,  1.04s/it]

Processing ivput270...


Evaluating features:  12%|█▎        | 2/16 [00:02<00:14,  1.05s/it]

Processing Grade...


Evaluating features:  19%|█▉        | 3/16 [00:03<00:14,  1.08s/it]

Processing weightedScore...


Evaluating features:  25%|██▌       | 4/16 [00:04<00:13,  1.09s/it]

Processing revenue...


Evaluating features:  31%|███▏      | 5/16 [00:05<00:12,  1.12s/it]

Processing netIncome...


Evaluating features:  38%|███▊      | 6/16 [00:06<00:11,  1.12s/it]

Processing eps...


Evaluating features:  44%|████▍     | 7/16 [00:07<00:10,  1.13s/it]

Processing freeCashFlowPerShare...


Evaluating features:  50%|█████     | 8/16 [00:08<00:08,  1.10s/it]

Processing Surprise...


Evaluating features:  56%|█████▋    | 9/16 [00:09<00:07,  1.11s/it]

Processing congress_net_trade...


Evaluating features:  62%|██████▎   | 10/16 [00:11<00:06,  1.13s/it]

Processing Sentiment...


Evaluating features:  69%|██████▉   | 11/16 [00:12<00:05,  1.13s/it]

Processing stocktwitsSentiment...


Evaluating features:  75%|███████▌  | 12/16 [00:13<00:04,  1.11s/it]

Processing twitterSentiment...


Evaluating features:  81%|████████▏ | 13/16 [00:14<00:03,  1.11s/it]

Processing news_sentimentScore...


Evaluating features:  88%|████████▊ | 14/16 [00:15<00:02,  1.11s/it]

Processing Dividend_y...


Evaluating features:  94%|█████████▍| 15/16 [00:16<00:01,  1.11s/it]

Processing roic...


Evaluating features: 100%|██████████| 16/16 [00:17<00:00,  1.11s/it]


Removed Surprise, Best RMSE: 32.459646301235985


Evaluating features:   0%|          | 0/15 [00:00<?, ?it/s]

Processing ^VIX...


Evaluating features:   7%|▋         | 1/15 [00:01<00:14,  1.02s/it]

Processing ivput270...


Evaluating features:  13%|█▎        | 2/15 [00:02<00:13,  1.02s/it]

Processing Grade...


Evaluating features:  20%|██        | 3/15 [00:03<00:12,  1.04s/it]

Processing weightedScore...


Evaluating features:  27%|██▋       | 4/15 [00:04<00:11,  1.07s/it]

Processing revenue...


Evaluating features:  33%|███▎      | 5/15 [00:05<00:10,  1.08s/it]

Processing netIncome...


Evaluating features:  40%|████      | 6/15 [00:06<00:09,  1.08s/it]

Processing eps...


Evaluating features:  47%|████▋     | 7/15 [00:07<00:08,  1.09s/it]

Processing freeCashFlowPerShare...


Evaluating features:  53%|█████▎    | 8/15 [00:08<00:07,  1.06s/it]

Processing congress_net_trade...


Evaluating features:  60%|██████    | 9/15 [00:09<00:06,  1.06s/it]

Processing Sentiment...


Evaluating features:  67%|██████▋   | 10/15 [00:10<00:05,  1.08s/it]

Processing stocktwitsSentiment...


Evaluating features:  73%|███████▎  | 11/15 [00:11<00:04,  1.08s/it]

Processing twitterSentiment...


Evaluating features:  80%|████████  | 12/15 [00:12<00:03,  1.09s/it]

Processing news_sentimentScore...


Evaluating features:  87%|████████▋ | 13/15 [00:13<00:02,  1.09s/it]

Processing Dividend_y...


Evaluating features:  93%|█████████▎| 14/15 [00:15<00:01,  1.09s/it]

Processing roic...


Evaluating features: 100%|██████████| 15/15 [00:16<00:00,  1.07s/it]


No further improvement.
Selected features: ['^VIX', 'ivput270', 'Grade', 'weightedScore', 'revenue', 'netIncome', 'eps', 'freeCashFlowPerShare', 'congress_net_trade', 'Sentiment', 'stocktwitsSentiment', 'twitterSentiment', 'news_sentimentScore', 'Dividend_y', 'roic']
Final RMSE on the hold-out test set: 32.459646301235985
Next day prediction: 400.4585003051758




In [None]:
results_df = pd.DataFrame({
    'Best_Features': best_features,
    'RMSE': best_rmse,
    'Next_Day_Prediction': one_day_pred
})
results_selected_be_features_rmse_and_predictions = results_df
results_selected_be_features_rmse_and_predictions.to_csv('results_selected_be_features_rmse_and_predictions.csv', index=True)
files.download('results_selected_be_features_rmse_and_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
results_selected_be_features_rmse_and_predictions = pd.read_csv('results_selected_be_features_rmse_and_predictions.csv')
results_selected_be_features_rmse_and_predictions = pd.DataFrame(results_selected_be_features_rmse_and_predictions)

# LSTM Model

## LSTM Model with Past Prices only

In [None]:
def lstm_forecast_with_tuning_log(data_series_base, n_steps=5, n_features=1):
  """
  LSTM Model for Past Prices only
  """
  if not isinstance(data_series_base, pd.Series):
    raise ValueError("data_series_base must be a pandas Series.")

  log_returns = np.log(data_series_base / data_series_base.shift(1)).dropna()
  data = log_returns.values

  def preprocess_data(data, n_steps):
    X, y = [], []
    for i in range(len(data) - n_steps):
      end_ix = i + n_steps
      seq_x, seq_y = data[i:end_ix], data[end_ix]
      X.append(seq_x)
      y.append(seq_y)
    return np.array(X), np.array(y)

  X, y = preprocess_data(data, n_steps)
  X = X.reshape((X.shape[0], X.shape[1], n_features))

  tscv = TimeSeriesSplit(n_splits=4)
  metrics_list = []

  for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    def build_model(hp):
      model = Sequential()
      model.add(LSTM(units=hp.Int('input_unit', min_value=32, max_value=256, step=32),
                      return_sequences=True, input_shape=(n_steps, n_features)))
      for i in range(hp.Int('n_layers', 1, 1)):
          model.add(LSTM(units=hp.Int(f'lstm_{i}_units', min_value=32, max_value=256, step=32),
                          return_sequences=(i < hp.Int('n_layers', 1, 1) - 1)))
      model.add(Dropout(hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)))
      model.add(Dense(1, activation=hp.Choice('dense_activation', values=['relu', 'sigmoid'], default='relu')))
      model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mse'])
      return model

    LOG_DIR = f"{int(time.time())}"
    tuner = RandomSearch(build_model,
                          objective=Objective("mse", direction="min"),
                          max_trials=5,
                          executions_per_trial=1,
                          directory=LOG_DIR)

    tuner.search(x=X_train, y=y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=0)

    best_model = tuner.get_best_models(num_models=1)[0]

    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r_squared = r2_score(y_test, y_pred)
    n = len(y_test)
    k = 1
    adjusted_r_squared = 1 - ((1 - r_squared) * (n - 1) / (n - k - 1))
    metrics_list.append((rmse, adjusted_r_squared))

  avg_rmse = np.mean([m[0] for m in metrics_list])
  avg_adjusted_r_squared = np.mean([m[1] for m in metrics_list])

  forecast_input = data[-n_steps:].reshape((1, n_steps, n_features))
  forecast = [best_model.predict(forecast_input)[0][0]]
  forecast_input = np.append(forecast_input.flatten()[1:], forecast).reshape((1, n_steps, n_features))
  forecast.append(best_model.predict(forecast_input)[0][0])

  last_price = data_series_base[-1]

  # Forecasted log returns
  log_return_day1 = forecast[0]
  log_return_day2 = forecast[1]

  # Convert log returns to actual prices
  forecast_price_day1 = last_price * np.exp(log_return_day1)
  forecast_price_day2 = forecast_price_day1 * np.exp(log_return_day2)

  results_df = pd.DataFrame({
      'Average RMSE': [avg_rmse],
      'Average Adjusted R-Squared': [avg_adjusted_r_squared],
      '1 Day Forecast': forecast_price_day1,
      '2 Day Forecast': forecast_price_day2
  })

  results_df.index = [data_series_base.name]

  return results_df


In [None]:
lstm_past_prices_results = lstm_forecast_with_tuning_log(msft_df['Adj Close'])











In [None]:
lstm_past_prices_results

Unnamed: 0,Average RMSE,Average Adjusted R-Squared,1 Day Forecast,2 Day Forecast
Adj Close,0.016611,-0.011936,402.701401,402.752624


In [None]:
# lstm_past_prices_results.to_csv('results_lstm_past_prices_results.csv', index=True)
# files.download('results_lstm_past_prices_results.csv')
lstm_past_prices_results = pd.read_csv('results_lstm_past_prices_results.csv')
lstm_past_prices_results = pd.DataFrame(lstm_past_prices_results)

## LSTM Model with All Features

In [None]:
columns_except = [col for col in master_df.columns if col not in ['Adj Close', 'Shifted Adj Close']]

In [None]:
lstm_all_features_results = lstm_model_with_tuning(master_df, 'Shifted Adj Close', columns_except)

Reloading Tuner from my_dir/f/tuner0.json




Reloading Tuner from my_dir/f/tuner0.json
Reloading Tuner from my_dir/f/tuner0.json




Reloading Tuner from my_dir/f/tuner0.json






In [None]:
lstm_all_features_results

Unnamed: 0,Average RMSE,"Forecast March 5, 2024","Forecast March 6, 2024"
0,76.522759,261.475311,261.263824


In [None]:
# lstm_all_features_results.to_csv('results_lstm_all_features.csv', index=True)
# files.download('results_lstm_all_features.csv')
lstm_all_features_results = pd.read_csv('results_lstm_all_features.csv')
lstm_all_features_results = pd.DataFrame(lstm_all_features_results)

## LSTM Model with Selected Features

In [None]:
columns_except = [col for col in df.columns if col not in ['Shifted Adj Close']]
lstm_selected_features_results = lstm_model_with_tuning(df, 'Shifted Adj Close',columns_except)

Trial 15 Complete [00h 00m 10s]
mse: 0.0276452898979187

Best mse So Far: 0.006463322788476944
Total elapsed time: 00h 02m 26s
Reloading Tuner from my_dir/g/tuner0.json
Reloading Tuner from my_dir/g/tuner0.json




Reloading Tuner from my_dir/g/tuner0.json






In [None]:
lstm_selected_features_results

Unnamed: 0,Average RMSE,"Forecast March 5, 2024","Forecast March 6, 2024"
0,35.884046,309.767822,309.995453


In [None]:
# lstm_selected_features_results.to_csv('results_lstm_selected_features.csv', index=True)
# files.download('results_lstm_selected_features.csv')
lstm_selected_features_results = pd.read_csv('results_lstm_selected_features.csv')
lstm_selected_features_results = pd.DataFrame(lstm_selected_features_results)

# Results Summary

In [None]:
rf_results_summary_dict = [
        {
        'Random Forest Model': 'All Features: Basic Model',
        'Test RMSE': rmse_df_all_basic['RMSE Scores'].values[0]
    },
        {
        'Random Forest Model': 'Selected Features: Basic Model',
        'Test RMSE': rmse_df_selected_basic['RMSE Scores'].values[0]
    },
        {
        'Random Forest Model': 'All Features: Iteratively Trained on Top Features',
        'Test RMSE': rmse_df_all_train_top_features['RMSE_Scores'].values[-1]
    },
        {
        'Random Forest Model': 'Selected Features: Iteratively Trained on Top Features',
        'Test RMSE': rmse_df_selected_train_top_features['RMSE_Scores'].values[-1]
    },
        {
        'Random Forest Model': 'All Features: Grid Search CV',
        'Test RMSE': results_all_gs_rmse['RMSE Scores'].values[0]
    },
        {
        'Random Forest Model': 'Selected Features: Grid Search CV',
        'Test RMSE': results_selected_gs_rmse_scores['RMSE Scores'].values[0]
    },
        {
        'Random Forest Model': 'All Features: Forward Selection',
        'Test RMSE': results_all_fs_features_rmse_and_predictions['RMSE'].values[0]
    },
         {
        'Random Forest Model': 'Selected Features: Forward Selection',
        'Test RMSE': results_selected_fs_features_rmse_and_predictions['RMSE'].values[0]
    },
        {
        'Random Forest Model': 'All Features: Backward Elimination',
        'Test RMSE': results_all_be_features_rmse_and_predictions['RMSE'].values[0]
    },
        {
        'Random Forest Model': 'Selected Features: Backward Elimination',
        'Test RMSE': results_selected_be_features_rmse_and_predictions['RMSE'].values[0]
    }
]

rf_results_summary_df = pd.DataFrame(rf_results_summary_dict)
rf_results_summary_df['Test RMSE'] = rf_results_summary_df['Test RMSE'].round(3)
rf_results_summary_df

Unnamed: 0,Random Forest Model,Test RMSE
0,All Features: Basic Model,35.335
1,Selected Features: Basic Model,43.164
2,All Features: Iteratively Trained on Top Features,31.994
3,Selected Features: Iteratively Trained on Top ...,45.869
4,All Features: Grid Search CV,35.236
5,Selected Features: Grid Search CV,31.213
6,All Features: Forward Selection,113.434
7,Selected Features: Forward Selection,39.57
8,All Features: Backward Elimination,34.147
9,Selected Features: Backward Elimination,32.46


From our various Random Forest models, the model with the Grid Search Cross Validation on Selected Features has the lowest RMSE on test data. The feature importance data from this model is presented below:

In [None]:
results_selected_gs_features

Unnamed: 0,Feature,Score
0,totalAssets,0.129405
1,operatingCashFlowPerShare,0.126714
2,netIncome,0.09654
3,ebitda,0.093344
4,eps,0.092403
5,revenue,0.080148
6,freeCashFlowPerShare,0.070664
7,roic,0.046475
8,dividendsPaid,0.04511
9,^VIX,0.042469


In [None]:
top_10_features = results_selected_gs_features.head(10)
remaining_features = results_selected_gs_features.iloc[10:]

# Sum the scores of the remaining features
sum_of_remaining = remaining_features['Score'].sum()

# Create a new row for the sum of remaining features and label it 'Other'
other_row = pd.DataFrame(data={'Feature': ['Other'], 'Score': [sum_of_remaining]})

# Append this row to the top 10 features DataFrame
final_df = pd.concat([top_10_features, other_row], ignore_index=True)

fig = px.pie(final_df, values='Score', names='Feature',
             title='Top 10 Features and Scores',
             color_discrete_sequence=px.colors.qualitative.Pastel1)


fig.update_traces(textposition='outside', textinfo='label+percent',
                  hoverinfo='label+percent',
                  insidetextorientation='radial')


fig.update_layout(
    uniformtext_minsize=10,
    uniformtext_mode='hide',
    showlegend=False,
    autosize=False,
    width=1000,
    height=600,
    template='plotly_white'
)

# Create a list of features in "Other" and format it as a single string
other_features_list = remaining_features['Feature'].tolist()
other_features_text = "Other Features (17.7%):<br>" + "<br>".join(other_features_list)

# Add the annotation with the list of "Other" features on the right side of the chart
fig.add_annotation(
    text=other_features_text,
    align='left',
    showarrow=False,
    xref='paper',
    yref='paper',
    x=1.3,
    y=1.2
)

fig.show()

This output suggests that the model recognizes the company size (Total Assets) and the efficiency of its operations (Operating Cash Flow Per Share) to be the most significant factor in affecting the stock price of the following day. These features reflect the company's financial health. The following few features, Net Income, EBITDA and EPS are all key indicators of the company's earnings and profitability and are critical to investors. The output of this model aligns with domain knowledge and financial theory regarding the most significant factors to affect a company's stock behavior. Further analysis and interpretation will be discussed in the project report.

Below is an examination of this Randon Forest Model's predictive ability compared to a simple baseline model, an LSTM model with past prices, an LSTM model with all features and an LSTM model with the selected features. The actual price of MSFT on March 5th, 2024 was 402.65.

In [None]:
data = [
    {
        'Model': 'Simple Baseline Model',
        'Test RMSE': average_rmse_over_timeframe,
        'Predicted Price': simple_prediction[0],
        'Validation RMSE': simple_prediction[1]
    },
    {
        'Model': 'LSTM Log of Past Prices',
        'Test RMSE': lstm_past_prices_results['Average RMSE'].values[0],
        'Predicted Price': lstm_past_prices_results['1 Day Forecast'].values[0],
        'Validation RMSE': np.sqrt(mean_squared_error([402.65], [lstm_past_prices_results['1 Day Forecast'].values[0]]))
    },
        {
        'Model': 'LSTM All Features',
        'Test RMSE': lstm_all_features_results['Average RMSE'].values[0],
        'Predicted Price': lstm_all_features_results['Forecast March 5, 2024'].values[0],
        'Validation RMSE': np.sqrt(mean_squared_error([402.65], [lstm_all_features_results['Forecast March 5, 2024'].values[0]]))
    },
        {
        'Model': 'LSTM Selected Features',
        'Test RMSE': lstm_selected_features_results['Average RMSE'].values[0],
        'Predicted Price': lstm_selected_features_results['Forecast March 5, 2024'].values[0],
        'Validation RMSE': np.sqrt(mean_squared_error([402.65], [lstm_selected_features_results['Forecast March 5, 2024'].values[0]]))
    },
        {
        'Model': 'Selected Features: Grid Search CV',
        'Test RMSE': results_selected_gs_rmse_scores['RMSE Scores'].values[0],
        'Predicted Price': results_selected_gs_prediction['Prediction'].values[0],
        'Validation RMSE': np.sqrt(mean_squared_error([402.65], [results_selected_gs_prediction['Prediction'].values[0]]))
    }

]

results_summary_df = pd.DataFrame(data)
results_summary_df['Test RMSE'] = results_summary_df['Test RMSE'].round(3)
results_summary_df['Predicted Price'] = results_summary_df['Predicted Price'].round(3)
results_summary_df['Validation RMSE'] = results_summary_df['Validation RMSE'].round(3)
results_summary_df

Unnamed: 0,Model,Test RMSE,Predicted Price,Validation RMSE
0,Simple Baseline Model,3.659,415.394,12.744
1,LSTM Log of Past Prices,0.017,402.701,0.051
2,LSTM All Features,76.523,261.475,141.175
3,LSTM Selected Features,35.884,309.768,92.882
4,Selected Features: Grid Search CV,31.213,401.352,1.298


In [None]:
sorted_df = results_summary_df.sort_values('Predicted Price', ascending=False)


fig = go.Figure()

# Add bars for 'Predicted Price'
fig.add_trace(go.Bar(x=sorted_df['Model'], y=sorted_df['Predicted Price'],
                     name='Predicted Price', marker_color='#FFCB05',
                     text=sorted_df['Predicted Price'],
                     textposition='outside',
                     texttemplate='%{text:.2f}'))

# Add a horizontal line for the actual price
fig.add_hline(y=402.65, line_color="#00274C",
              annotation_text="Actual Price $402.65", annotation_position="top right",
              annotation_font_color="#00274C")

# Customize layout
fig.update_layout(
    title='1 Day Predicted Price by Model',
    xaxis_title='Model',
    yaxis_title='Predicted Price ($)',
    xaxis_tickangle=-20,
    template='plotly_white',
    yaxis=dict(range=[200, 500])
)

# Show the figure
fig.show()

The LSTM Model with Past Prices appears the most accurate both in testing and validation predictions. The Random Forest Grid Search Model with Selected Features also shows a reasonable performance on unseen data despite an elevated test RMSE. The remaining models do not appear to generalize well on unseen data, indicating poor predictive ability.