In [2]:
# notebooks/stock_analysis.ipynb

# Import system references
import sys
import os

# Ensure project_root is in the system path
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
    
#print("Project root added to sys.path:", project_root in sys.path)
#print(sys.path)



In [3]:
#setup Logging
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')



In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import torch
import torch.nn as nn
import torch.optim as optim

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler

# Import the fetch_stock_data function
from scripts import fetch_stock_data, transform_stock_data_to_delta, transform_with_history
from scripts import prepare_data_for_training, create_time_series_windows, gru_model, train_gru_model, prepare_sliding_window_data

In [5]:
# Ask the user for a ticker symbol
ticker = input("Please enter a stock ticker symbol (e.g., 'AAPL', 'SPY', 'QQQ', etc.): ").upper()

# Define start and end dates
start_date = '2021-01-01'

# If end_date is not provided, you can set it to today's date
end_date = datetime.today().strftime('%Y-%m-%d')  # Default to today's date if not specified

# Print the ticker and date range to confirm
print(f"Fetching data for {ticker} from {start_date} to {end_date}")


Fetching data for SPY from 2021-01-01 to 2024-10-27


In [6]:
# Fetch stock data
stock_data_df = fetch_stock_data(ticker, start_date, end_date)
#stock_data_df = fetch_stock_data(ticker, start_date)

# Check if data is fetched successfully
if not stock_data_df.empty:
    # Display the first few rows
    display(stock_data_df.head())
else:
    print("No data to display.")

2024-10-27 16:22:16,766 - INFO - End date provided: 2024-10-27
2024-10-27 16:22:16,766 - INFO - Fetching data for ticker: SPY from 2021-01-01 to 2024-10-27
2024-10-27 16:22:19,960 - INFO - Resetting index to make 'Date' a column.
2024-10-27 16:22:19,960 - INFO - Flattening multi-level column names (removing ticker symbol).
2024-10-27 16:22:19,960 - INFO - Successfully fetched and simplified data for ticker 'SPY'.


Price,Date,Adj Close,Close,High,Low,Open,Volume
0,2021-01-04 00:00:00+00:00,349.47168,368.790009,375.450012,364.820007,375.309998,110210800
1,2021-01-05 00:00:00+00:00,351.878662,371.329987,372.5,368.049988,368.100006,66426200
2,2021-01-06 00:00:00+00:00,353.982361,373.549988,376.980011,369.119995,369.709991,107997700
3,2021-01-07 00:00:00+00:00,359.241608,379.100006,379.899994,375.910004,376.100006,68766800
4,2021-01-08 00:00:00+00:00,361.288513,381.26001,381.48999,377.100006,380.589996,71677200


In [30]:
# Transform Stock Data to Deltas
columns_to_exclude = ['Adj Close']  # Drop 'Adj Close'
columns_to_keep = ['Volume']  # Keep 'Volume' but exclude from delta calculation
columns_to_calculate = ['Open', 'High', 'Low', 'Close']  # Calculate deltas for 'Open' and 'Close'

transformed_data_df = transform_stock_data_to_delta(
    stock_data_df, 
    columns_to_exclude=columns_to_exclude, 
    columns_to_calculate=columns_to_calculate, 
    columns_to_keep=columns_to_keep
)

#transformed_data_df = transform_stock_data_to_delta(stock_data_df)
#transformed_data_df = transform_stock_data_to_delta(stock_data_df, exclude=['Volume'])

# Display Transformed Data
if not transformed_data_df.empty:
    display(transformed_data_df.head())
    display(transformed_data_df.tail())
else:
    print("No transformed data to display.")


2024-10-27 16:48:26,090 - INFO - Starting transformation of stock data to deltas.
2024-10-27 16:48:26,090 - INFO - Dropping columns: ['Adj Close']
2024-10-27 16:48:26,107 - INFO - Calculating deltas for specified columns: ['Open', 'High', 'Low', 'Close']
2024-10-27 16:48:26,109 - INFO - Transforming column: Open
2024-10-27 16:48:26,114 - INFO - Transforming column: High
2024-10-27 16:48:26,115 - INFO - Transforming column: Low
2024-10-27 16:48:26,115 - INFO - Transforming column: Close
2024-10-27 16:48:26,122 - INFO - Dropping the first row with NaN values after delta calculation.
2024-10-27 16:48:26,132 - INFO - Successfully transformed stock data to deltas.


Price,Date,Close,High,Low,Open,Volume,Open_delta,High_delta,Low_delta,Close_delta
1,2021-01-05 00:00:00+00:00,371.329987,372.5,368.049988,368.100006,66426200,-0.0192,-0.0079,0.0089,0.0069
2,2021-01-06 00:00:00+00:00,373.549988,376.980011,369.119995,369.709991,107997700,0.0044,0.012,0.0029,0.006
3,2021-01-07 00:00:00+00:00,379.100006,379.899994,375.910004,376.100006,68766800,0.0173,0.0077,0.0184,0.0149
4,2021-01-08 00:00:00+00:00,381.26001,381.48999,377.100006,380.589996,71677200,0.0119,0.0042,0.0032,0.0057
5,2021-01-11 00:00:00+00:00,378.690002,380.579987,377.720001,377.850006,51034700,-0.0072,-0.0024,0.0016,-0.0067


Price,Date,Close,High,Low,Open,Volume,Open_delta,High_delta,Low_delta,Close_delta
955,2024-10-21 00:00:00+00:00,583.630005,584.849976,580.599976,583.849976,36439000,-0.0004,-0.0009,-0.0034,-0.0016
956,2024-10-22 00:00:00+00:00,583.320007,584.5,580.380005,581.049988,34183800,-0.0048,-0.0006,-0.0004,-0.0005
957,2024-10-23 00:00:00+00:00,577.98999,581.710022,574.419983,581.26001,49314600,0.0004,-0.0048,-0.0103,-0.0091
958,2024-10-24 00:00:00+00:00,579.23999,580.059998,576.570007,579.97998,34979900,-0.0022,-0.0028,0.0037,0.0022
959,2024-10-25 00:00:00+00:00,579.039978,584.460022,578.080017,581.51001,47105000,0.0026,0.0076,0.0026,-0.0003


In [8]:
# Prepare the data by removing non-delta columns
delta_only_df = prepare_sliding_window_data(transformed_data_df)

# Display the first few rows of the resulting DataFrame
display(delta_only_df.head())


Price,Open_delta,High_delta,Low_delta,Close_delta
1,-0.0192,-0.0079,0.0089,0.0069
2,0.0044,0.012,0.0029,0.006
3,0.0173,0.0077,0.0184,0.0149
4,0.0119,0.0042,0.0032,0.0057
5,-0.0072,-0.0024,0.0016,-0.0067


In [9]:
# Define parameters
n_timesteps = 100  # Number of timesteps (sequence length)

# Step 1: Create the time series windows (X and y)
X, y = create_time_series_windows(delta_only_df, 'Close_delta', n_timesteps)

# Display the first few occurrences of the X and y arrays
print("First 5 entries of X:\n", X[:5])  # Display first 5 rows
print("First 5 entries of y:\n", y[:5])  # Display first 5 target values

# Step 2: Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Display the shapes of the training and testing sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


First 5 entries of X:
 [[[-0.0192 -0.0079  0.0089  0.0069]
  [ 0.0044  0.012   0.0029  0.006 ]
  [ 0.0173  0.0077  0.0184  0.0149]
  ...
  [ 0.0072  0.0009  0.0013 -0.0022]
  [-0.0035 -0.0026  0.0003  0.002 ]
  [ 0.0031  0.0026  0.0029  0.0005]]

 [[ 0.0044  0.012   0.0029  0.006 ]
  [ 0.0173  0.0077  0.0184  0.0149]
  [ 0.0119  0.0042  0.0032  0.0057]
  ...
  [-0.0035 -0.0026  0.0003  0.002 ]
  [ 0.0031  0.0026  0.0029  0.0005]
  [ 0.0019  0.0013  0.0019  0.0018]]

 [[ 0.0173  0.0077  0.0184  0.0149]
  [ 0.0119  0.0042  0.0032  0.0057]
  [-0.0072 -0.0024  0.0016 -0.0067]
  ...
  [ 0.0031  0.0026  0.0029  0.0005]
  [ 0.0019  0.0013  0.0019  0.0018]
  [ 0.0038  0.0035 -0.0014 -0.0009]]

 [[ 0.0119  0.0042  0.0032  0.0057]
  [-0.0072 -0.0024  0.0016 -0.0067]
  [ 0.0028 -0.0019 -0.0036  0.0002]
  ...
  [ 0.0019  0.0013  0.0019  0.0018]
  [ 0.0038  0.0035 -0.0014 -0.0009]
  [-0.0052 -0.0035  0.0002  0.0016]]

 [[-0.0072 -0.0024  0.0016 -0.0067]
  [ 0.0028 -0.0019 -0.0036  0.0002]
  [-0.000

In [10]:
# Initialize the scaler for X and y values
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Reshape X data to 2D for scaling, keeping the last dimension as features
n_samples_train = X_train.shape[0]
n_samples_test = X_test.shape[0]

X_train_reshaped = X_train.reshape(-1, X_train.shape[-1])  # Reshape to 2D: [samples * timesteps, features]
X_test_reshaped = X_test.reshape(-1, X_test.shape[-1])

# Apply scaling to X features (fit on X_train, transform both X_train and X_test)
X_train_scaled = scaler_X.fit_transform(X_train_reshaped).reshape(n_samples_train, X_train.shape[1], X_train.shape[2])
X_test_scaled = scaler_X.transform(X_test_reshaped).reshape(n_samples_test, X_test.shape[1], X_test.shape[2])

# Reshape y values to 2D (required by MinMaxScaler)
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# Apply scaling to y values
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)



In [20]:
# Define parameters
#n_timesteps = 25
n_features = X_train_scaled.shape[2]

# Define the number of layers and units per layer
num_layers = 3
units_per_layer = 96  # 3 layers with 64 units for each layer

# Call the GRU model
y_pred_scaled, model = train_gru_model(X_train, y_train, X_test, y_test, 
                                        n_timesteps, n_features, 
                                        num_layers, units_per_layer, 
                                        learning_rate=0.00001, epochs=100, batch_size=32)
# Output the predictions
#print("Predictions from GRU model:", y_pred_scaled)



Epoch [5/100], Loss: 0.0005
Epoch [10/100], Loss: 0.0003
Epoch [15/100], Loss: 0.0002
Epoch [20/100], Loss: 0.0001
Epoch [25/100], Loss: 0.0001
Epoch [30/100], Loss: 0.0001
Epoch [35/100], Loss: 0.0001
Epoch [40/100], Loss: 0.0001
Epoch [45/100], Loss: 0.0001
Epoch [50/100], Loss: 0.0001
Epoch [55/100], Loss: 0.0001
Epoch [60/100], Loss: 0.0001
Epoch [65/100], Loss: 0.0001
Epoch [70/100], Loss: 0.0001
Epoch [75/100], Loss: 0.0001
Epoch [80/100], Loss: 0.0001
Epoch [85/100], Loss: 0.0001
Epoch [90/100], Loss: 0.0001
Epoch [95/100], Loss: 0.0001
Epoch [100/100], Loss: 0.0001
Test Loss: 0.0001


In [25]:
# Step 1: Inverse transform the predicted and actual y values to the original scale
y_pred_original = scaler_y.inverse_transform(y_pred_scaled)
y_test_original = scaler_y.inverse_transform(y_test_scaled)

# Display the first few occurrences of prediction and test scaled and unscaled arrays
print("First 5 entries of y_pred_scaled", y_pred_scaled[:5])  # Display first 5 rows
print("First 5 entries of y_pred_original", y_pred_original[:5])  # Display first 5 rows
print("First 5 entries of y_test_scaled", y_test_scaled[:5])  # Display first 5 target values
print("First 5 entries of y_test_original", y_test_original[:5])  # Display first 5 target values

# Mean Squared Error
mse = mean_squared_error(y_test_original, y_pred_original)
# Root Mean Squared Error
rmse = np.sqrt(mse)
# Mean Absolute Error
mae = mean_absolute_error(y_test_original, y_pred_original)
# R-squared
r2 = r2_score(y_test_original, y_pred_original)

# Print the metrics
print(f"Mean Squared Error (MSE): {mse:.6f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.6f}")
print(f"Mean Absolute Error (MAE): {mae:.6f}")
print(f"R-squared (R²): {r2:.6f}")

First 5 entries of y_pred_scaled [[0.00057942]
 [0.0003001 ]
 [0.00055862]
 [0.00080161]
 [0.00053019]]
First 5 entries of y_pred_original [[-0.04344292]
 [-0.04347044]
 [-0.04344498]
 [-0.04342104]
 [-0.04344778]]
First 5 entries of y_test_scaled [[0.37664975]
 [0.5177665 ]
 [0.41725888]
 [0.49137056]
 [0.76142132]]
First 5 entries of y_test_original [[-0.0064]
 [ 0.0075]
 [-0.0024]
 [ 0.0049]
 [ 0.0315]]
Mean Squared Error (MSE): 0.001977
Root Mean Squared Error (RMSE): 0.044462
Mean Absolute Error (MAE): 0.043154
R-squared (R²): -16.240037


In [26]:
# Create a DataFrame to compare actual and predicted values
comparison_df = pd.DataFrame({'Actual': y_test_original.flatten(), 'Predicted': y_pred_original.flatten()})
# Calculate the difference (error)
comparison_df['Difference'] = comparison_df['Actual'] - comparison_df['Predicted']
#print(comparison_df.head(25))

In [27]:
# Get the sign of the actual and predicted values
actual_sign = np.sign(comparison_df['Actual'])
predicted_sign = np.sign(comparison_df['Predicted'])
#print(actual_sign.head(25))
#print(predicted_sign.head(25))
# Check where the signs match
sign_matches = actual_sign == predicted_sign
#print(sign_matches.head(25))

# Calculate the percentage of sign matches
percentage_match = sign_matches.mean() * 100

print(f"Percentage of Sign Matches: {percentage_match :.2f}%")

Percentage of Sign Matches: 50.00%


In [29]:
# Get the prediction for tomorrow's Close_delta

# Get the prediction for tomorrow's Close_delta

"""
Predict the next Close_delta based on the most recent n_timesteps of data using the trained PyTorch model.

Using:
- model: The trained PyTorch GRU model.
- delta_only_df (pd.DataFrame): The DataFrame containing all historical data, including the latest Close_delta.
- n_timesteps (int): The number of timesteps (sequence length) used in the model.
- scaler_X: The scaler used to scale the features (X).
- scaler_y: The scaler used to scale the target (y).

Returns:
- float: The predicted Close_delta for the next day.
"""
model.eval()  # Set the model to evaluation mode

# Step 1: Extract the last n_timesteps rows from the data (to be used as input for prediction)
last_window = delta_only_df[-n_timesteps:].values.reshape(1, n_timesteps, delta_only_df.shape[1])

# Step 2: Scale the input features
last_window_scaled = scaler_X.transform(last_window.reshape(-1, last_window.shape[-1])).reshape(1, n_timesteps, -1)

# Step 3: Convert the scaled input to a PyTorch tensor
last_window_tensor = torch.tensor(last_window_scaled, dtype=torch.float32)

# Step 4: Use the model to predict the next Close_delta (scaled)
with torch.no_grad():  # Ensure no gradients are calculated
    predicted_close_delta_scaled = model(last_window_tensor)

# Step 5: Convert the PyTorch tensor to a NumPy array
predicted_close_delta_scaled = predicted_close_delta_scaled.numpy()

# Step 6: Inverse transform the prediction to get the original scale of Close_delta
predicted_close_delta = scaler_y.inverse_transform(predicted_close_delta_scaled)

predicted_close_delta = predicted_close_delta[0][0]  # Return the predicted value

# Decision logic: Buy if positive, Sell if negative
if predicted_close_delta > 0:
    print(f"Predicted Close increase for {ticker} tomorrow is {(predicted_close_delta * 100):.2f}%. Suggestion: BUY with {percentage_match :.2f}% confidence")
else:
    print(f"Predicted Close decrease for {ticker} tomorrow is {(predicted_close_delta * 100):.2f}%. Suggestion: SELL with {percentage_match :.2f}% confidence")


Predicted Close decrease for tomorrow is -4.09%. Suggestion: SELL with 50.00% confidence
