In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import datetime as dt
import ta
import keras
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
import joblib

In [2]:
df=pd.read_csv('data/merged_data.csv')
df.info

<bound method DataFrame.info of             Date        Open        High         Low       Close     Volume  \
0     2010-01-04    6.414464    6.446622    6.382907    6.431896  493729600   
1     2010-01-05    6.449629    6.479382    6.409055    6.443017  601904800   
2     2010-01-06    6.443018    6.468564    6.333921    6.340533  552160000   
3     2010-01-07    6.363974    6.371488    6.282827    6.328810  477131200   
4     2010-01-08    6.320397    6.371490    6.283131    6.370888  447610800   
...          ...         ...         ...         ...         ...        ...   
3622  2024-05-24  187.941069  189.692870  187.164685  189.095657   36294600   
3623  2024-05-28  190.618535  192.101605  188.219765  189.105621   52280100   
3624  2024-05-29  188.727378  191.355088  188.627837  189.404205   53068000   
3625  2024-05-30  189.872035  191.285423  189.742650  190.399567   49947900   
3626  2024-05-31  190.548875  191.673620  189.025999  191.355103   75158300   

          SMA_10   

In [3]:
STOCK_TICKER = "AAPL"
START_DATE = "2010-01-01"
END_DATE = "2024-06-01" 
LOOK_BACK_DAYS = 60      # Number of past days to consider for prediction (timesteps)
PREDICTION_DAYS = 30     # Days to reserve for the test set (from end of data)
EPOCHS = 100             
BATCH_SIZE = 32
LSTM_UNITS = 50          
DROPOUT_RATE = 0.3       
CNN_FILTERS = 64         
CNN_KERNEL_SIZE = 3 
LEARNING_RATE = 0.0005   

# Early Stopping parameters
PATIENCE = 20            
MIN_DELTA = 0.001 

In [4]:
MODEL_SAVE_PATH = 'models/stock_prediction_model.keras' 
SCALER_SAVE_PATH = 'models/feature_scaler.pkl'
TARGET_SCALER_SAVE_PATH = 'models/target_scaler.pkl'

In [5]:
initial_rows_before_dropna = len(df)
df.dropna(inplace=True)
num_rows_dropped = initial_rows_before_dropna - len(df)
print(f"Dropped {num_rows_dropped} rows containing NaN values from indicator calculations.")

Dropped 49 rows containing NaN values from indicator calculations.


In [6]:
selected_features=['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'SMA_10', 'EMA_10',
       'SMA_50', 'EMA_50', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Diff',
       'BB_High', 'BB_Low', 'BB_Mid', 'BB_Width', 'Log_Return',
       'Volatility_20', 'sentiment_compound', 'sentiment_positive',
       'sentiment_negative', 'sentiment_neutral']
features = ['Close', 'Volume', 'SMA_10', 'EMA_10', 'SMA_50', 'EMA_50', 'RSI',
            'MACD', 'MACD_Signal', 'MACD_Diff', 'BB_High', 'BB_Low', 'BB_Mid',
            'BB_Width', 'Volatility_20']
target = 'Close'

In [7]:
# Scale all features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_features = scaler.fit_transform(df[features].values)

# Create a separate scaler for the target variable (Close price)
target_scaler = MinMaxScaler(feature_range=(0, 1))
target_scaler.fit(df[[target]].values) 

In [8]:
# We need to ensure there are enough samples for both train and test, plus look_back_days
if len(scaled_features) <= LOOK_BACK_DAYS + PREDICTION_DAYS+1:
    print(f"Warning: Not enough data for required LOOK_BACK_DAYS ({LOOK_BACK_DAYS}) + PREDICTION_DAYS ({PREDICTION_DAYS}).")
    print(f"Total available data after NA drop: {len(scaled_features)} rows.")
    print("Please increase your START_DATE or reduce LOOK_BACK_DAYS/PREDICTION_DAYS.")
    exit()

# The split point for the features array
split_point_features = len(scaled_features) - PREDICTION_DAYS

if split_point_features < LOOK_BACK_DAYS:
     print(f"Error: Not enough data to create training sequences. Split point ({split_point_features}) is less than LOOK_BACK_DAYS ({LOOK_BACK_DAYS}).")
     print("Adjust START_DATE, END_DATE, LOOK_BACK_DAYS, or PREDICTION_DAYS.")
     exit()

train_data_scaled = scaled_features[0:split_point_features, :]
test_data_scaled = scaled_features[split_point_features - LOOK_BACK_DAYS:, :]

In [9]:
def create_sequences_multifeature(data, target_index_in_data, look_back):
    X, y = [], []
    for i in range(look_back, len(data)):
        X.append(data[i-look_back:i, :]) 
        y.append(data[i, target_index_in_data]) # Predict target for the next day
    return np.array(X), np.array(y)

# Get the index of the 'Close' price in our `features` list
target_feature_index = features.index(target)

X_train, y_train_unshaped = create_sequences_multifeature(train_data_scaled, target_feature_index, LOOK_BACK_DAYS)
X_test, y_test_unshaped = create_sequences_multifeature(test_data_scaled, target_feature_index, LOOK_BACK_DAYS)

X_train = X_train.astype(np.float32)
y_train_scaled = y_train_unshaped.reshape(-1, 1).astype(np.float32)
X_test = X_test.astype(np.float32)
y_test_scaled = y_test_unshaped.reshape(-1, 1).astype(np.float32)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train_scaled.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test_scaled.shape}")

X_train shape: (3488, 60, 15)
y_train shape: (3488, 1)
X_test shape: (30, 60, 15)
y_test shape: (30, 1)


In [10]:
if X_train.shape[0] == 0 or X_train.shape[1] == 0 or X_train.shape[2] == 0:
    print(f"Error: X_train has a zero dimension (shape={X_train.shape}). Check data slicing and LOOK_BACK_DAYS.")
    exit()
if y_train_unshaped.shape[0] == 0:
    print(f"Error: y_train_unshaped has 0 samples (shape={y_train_unshaped.shape}). Check data slicing and LOOK_BACK_DAYS.")
    exit()
if X_test.shape[0] == 0 or X_test.shape[1] == 0 or X_test.shape[2] == 0:
    print(f"Error: X_test has a zero dimension (shape={X_test.shape}). Check data slicing and LOOK_BACK_DAYS/PREDICTION_DAYS.")
    exit()
if y_test_unshaped.shape[0] == 0:
    print(f"Error: y_test_unshaped has 0 samples (shape={y_test_unshaped.shape}). Check data slicing and LOOK_BACK_DAYS/PREDICTION_DAYS.")
    exit()

print(f"Final shape of X_train for model: {X_train.shape}")
print(f"Final shape of y_train_scaled for model: {y_train_scaled.shape}")
print(f"Final shape of X_test for model: {X_test.shape}")
print(f"Final shape of y_test_scaled for model: {y_test_scaled.shape}")
print(f"Dtype of X_train: {X_train.dtype}")
print(f"Dtype of y_train_scaled: {y_train_scaled.dtype}")

Final shape of X_train for model: (3488, 60, 15)
Final shape of y_train_scaled for model: (3488, 1)
Final shape of X_test for model: (30, 60, 15)
Final shape of y_test_scaled for model: (30, 1)
Dtype of X_train: float32
Dtype of y_train_scaled: float32


In [11]:
# Define EarlyStopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=PATIENCE,   # Stop if val_loss doesn't improve for this many epochs
    min_delta=MIN_DELTA, # Minimum change to be considered an improvement
    mode='min',          # Look for minimum val_loss
    restore_best_weights=True, # Restore model weights from the epoch with the best value of the monitored quantity.
    verbose=1
)

In [12]:
"""# %%

print("\n--- Building CNN Model ---")

# Define the CNN model architecture
model_cnn = Sequential()
model_cnn.add(Conv1D(filters=CNN_FILTERS, kernel_size=CNN_KERNEL_SIZE, activation='relu',
                     input_shape=(X_train.shape[1], X_train.shape[2])))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(Dropout(DROPOUT_RATE))
model_cnn.add(Conv1D(filters=CNN_FILTERS // 2, kernel_size=CNN_KERNEL_SIZE, activation='relu')) # Add another Conv layer
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(Dropout(DROPOUT_RATE))
model_cnn.add(tf.keras.layers.Flatten()) # Flatten the output for the Dense layer
model_cnn.add(Dense(units=LSTM_UNITS, activation='relu')) # Use LSTM_UNITS for a dense layer size
model_cnn.add(Dense(units=1)) # Output layer for regression

model_cnn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss='mean_squared_error')
model_cnn.summary()

# Train the CNN model
print("\nTraining Standalone CNN Model...")
history_cnn = model_cnn.fit(X_train, y_train_scaled, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1,
                            validation_split=0.1, callbacks=[early_stopping])
"""

'# %%\n\nprint("\n--- Building CNN Model ---")\n\n# Define the CNN model architecture\nmodel_cnn = Sequential()\nmodel_cnn.add(Conv1D(filters=CNN_FILTERS, kernel_size=CNN_KERNEL_SIZE, activation=\'relu\',\n                     input_shape=(X_train.shape[1], X_train.shape[2])))\nmodel_cnn.add(MaxPooling1D(pool_size=2))\nmodel_cnn.add(Dropout(DROPOUT_RATE))\nmodel_cnn.add(Conv1D(filters=CNN_FILTERS // 2, kernel_size=CNN_KERNEL_SIZE, activation=\'relu\')) # Add another Conv layer\nmodel_cnn.add(MaxPooling1D(pool_size=2))\nmodel_cnn.add(Dropout(DROPOUT_RATE))\nmodel_cnn.add(tf.keras.layers.Flatten()) # Flatten the output for the Dense layer\nmodel_cnn.add(Dense(units=LSTM_UNITS, activation=\'relu\')) # Use LSTM_UNITS for a dense layer size\nmodel_cnn.add(Dense(units=1)) # Output layer for regression\n\nmodel_cnn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss=\'mean_squared_error\')\nmodel_cnn.summary()\n\n# Train the CNN model\nprint("\nTraining Standalone CN

In [13]:
CNN_MODEL_SAVE_PATH='models/cnn_model.keras'
"""print("\nSaving the trained CNN model...")
model_cnn.save(CNN_MODEL_SAVE_PATH)
print(f"Model saved to: {CNN_MODEL_SAVE_PATH}")"""

'print("\nSaving the trained CNN model...")\nmodel_cnn.save(CNN_MODEL_SAVE_PATH)\nprint(f"Model saved to: {CNN_MODEL_SAVE_PATH}")'

In [14]:
"""# %%
# --- LSTM Model ---
print("\n--- Building and Training Standalone LSTM Model ---")

# Define the LSTM model architecture
model_lstm = Sequential()
model_lstm.add(LSTM(units=LSTM_UNITS, return_sequences=True,
                    input_shape=(X_train.shape[1], X_train.shape[2])))
model_lstm.add(Dropout(DROPOUT_RATE))
model_lstm.add(LSTM(units=LSTM_UNITS)) # No return_sequences for the last LSTM layer
model_lstm.add(Dropout(DROPOUT_RATE))
model_lstm.add(Dense(units=1)) # Output layer for regression

model_lstm.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss='mean_squared_error')
model_lstm.summary()

# Train the LSTM model
print("\nTraining Standalone LSTM Model...")
history_lstm = model_lstm.fit(X_train, y_train_scaled, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1,
                             validation_split=0.1, callbacks=[early_stopping])

"""

'# %%\n# --- LSTM Model ---\nprint("\n--- Building and Training Standalone LSTM Model ---")\n\n# Define the LSTM model architecture\nmodel_lstm = Sequential()\nmodel_lstm.add(LSTM(units=LSTM_UNITS, return_sequences=True,\n                    input_shape=(X_train.shape[1], X_train.shape[2])))\nmodel_lstm.add(Dropout(DROPOUT_RATE))\nmodel_lstm.add(LSTM(units=LSTM_UNITS)) # No return_sequences for the last LSTM layer\nmodel_lstm.add(Dropout(DROPOUT_RATE))\nmodel_lstm.add(Dense(units=1)) # Output layer for regression\n\nmodel_lstm.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss=\'mean_squared_error\')\nmodel_lstm.summary()\n\n# Train the LSTM model\nprint("\nTraining Standalone LSTM Model...")\nhistory_lstm = model_lstm.fit(X_train, y_train_scaled, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1,\n                             validation_split=0.1, callbacks=[early_stopping])\n\n'

In [15]:
LSTM_MODEL_SAVE_PATH='models/lstm_model.keras'
"""print("\nSaving the trained LSTM model...")
model_lstm.save(LSTM_MODEL_SAVE_PATH)
print(f"Model saved to: {LSTM_MODEL_SAVE_PATH}")"""

'print("\nSaving the trained LSTM model...")\nmodel_lstm.save(LSTM_MODEL_SAVE_PATH)\nprint(f"Model saved to: {LSTM_MODEL_SAVE_PATH}")'

In [16]:
loaded_cnn_model = load_model(CNN_MODEL_SAVE_PATH)
loaded_scaler = joblib.load(SCALER_SAVE_PATH)
loaded_target_scaler = joblib.load(TARGET_SCALER_SAVE_PATH)
print("\nEvaluating CNN model performance on the test set...")
y_pred_scaled = loaded_cnn_model.predict(X_test)

# Inverse transform predictions and actual values to original scale
y_test_original = loaded_target_scaler.inverse_transform(y_test_scaled)
y_pred_original = loaded_target_scaler.inverse_transform(y_pred_scaled)

rmse_cnn = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
mae_cnn = mean_absolute_error(y_test_original, y_pred_original)

print(f"Root Mean Squared Error (RMSE): {rmse_cnn:.2f}")
print(f"Mean Absolute Error (MAE): {mae_cnn:.2f}")


Evaluating CNN model performance on the test set...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 432ms/step
Root Mean Squared Error (RMSE): 47.96
Mean Absolute Error (MAE): 47.33


In [17]:
loaded_lstm_model = load_model(LSTM_MODEL_SAVE_PATH)
print("\nEvaluating LSTM model performance on the test set...")
y_pred_scaled = loaded_lstm_model.predict(X_test)

# Inverse transform predictions and actual values to original scale
y_test_original = loaded_target_scaler.inverse_transform(y_test_scaled)
y_pred_original = loaded_target_scaler.inverse_transform(y_pred_scaled)

rmse_lstm = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
mae_lstm = mean_absolute_error(y_test_original, y_pred_original)

print(f"Root Mean Squared Error (RMSE): {rmse_lstm:.2f}")
print(f"Mean Absolute Error (MAE): {mae_lstm:.2f}")


Evaluating LSTM model performance on the test set...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499ms/step
Root Mean Squared Error (RMSE): 8.05
Mean Absolute Error (MAE): 7.63


In [18]:
loaded_model = load_model(MODEL_SAVE_PATH)
print("\nEvaluating CNN+BiLSTM model performance on the test set...")
y_pred_scaled = loaded_model.predict(X_test)

# Inverse transform predictions and actual values to original scale
y_test_original = loaded_target_scaler.inverse_transform(y_test_scaled)
y_pred_original = loaded_target_scaler.inverse_transform(y_pred_scaled)

rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
mae = mean_absolute_error(y_test_original, y_pred_original)

print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")


Evaluating CNN+BiLSTM model performance on the test set...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 607ms/step
Root Mean Squared Error (RMSE): 5.62
Mean Absolute Error (MAE): 4.71


In [19]:
print("\n--- Comparing Model Performance ---")

print(f"Current CNN+Bi-LSTM Model RMSE: {rmse:.2f}") # Assuming `rmse` is from your original model
print(f"CNN Model RMSE:     {rmse_cnn:.2f}")
print(f"LSTM Model RMSE:    {rmse_lstm:.2f}")

# You can also compare MAE if you like
print(f"\nCurrent CNN+Bi-LSTM Model MAE: {mae:.2f}")
print(f"CNN Model MAE:     {mae_cnn:.2f}")
print(f"LSTM Model MAE:    {mae_lstm:.2f}")

# Determine the best performing model
best_rmse = min(rmse, rmse_cnn, rmse_lstm)
if best_rmse == rmse:
    print("\nBest performing model based on RMSE: Original CNN+Bi-LSTM Model")
elif best_rmse == rmse_cnn:
    print("\nBest performing model based on RMSE: CNN Model")
else:
    print("\nBest performing model based on RMSE: LSTM Model")
best_mae = min(mae, mae_cnn, mae_lstm)
if best_rmse == rmse:
    print("\nBest performing model based on MAE: Original CNN+Bi-LSTM Model")
elif best_rmse == rmse_cnn:
    print("\nBest performing model based on MAE: CNN Model")
else:
    print("\nBest performing model based on MAE: LSTM Model")


--- Comparing Model Performance ---
Current CNN+Bi-LSTM Model RMSE: 5.62
CNN Model RMSE:     47.96
LSTM Model RMSE:    8.05

Current CNN+Bi-LSTM Model MAE: 4.71
CNN Model MAE:     47.33
LSTM Model MAE:    7.63

Best performing model based on RMSE: Original CNN+Bi-LSTM Model

Best performing model based on MAE: Original CNN+Bi-LSTM Model
