In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import yfinance as yf
 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Data Preparation
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

# keras Models
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping

# sklearn Regressor Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# import Visualisation library
from tabulate import tabulate

import ta
from sklearn.decomposition import PCA

In [2]:
# Download historical price data from Yahoo Finance and store in a pandas DataFrame
df = yf.download("GOOGL", start='2020-01-01', end='2023-05-31', progress=False)

# Add technical indicators
# Simple Moving Average (SMA)
df['SMA_10'] = ta.trend.SMAIndicator(df['Close'], window=10).sma_indicator()

# Exponential Moving Average (EMA)
df['EMA_10'] = ta.trend.EMAIndicator(df['Close'], window=10).ema_indicator()

# Relative Strength Index (RSI)
df['RSI'] = ta.momentum.RSIIndicator(df['Close'], window=10).rsi()

# Average True Range (ATR)
df['ATR'] = ta.volatility.AverageTrueRange(
    df['High'], df['Low'], df['Close'], window=14)

# Moving Average Convergence Divergence (MACD)
macd = ta.trend.MACD(df['Close'], window_slow=26,
                    window_fast=12, window_sign=9)
df['MACD'] = macd.macd() 
df['MACD_Signal'] = macd.macd_signal()

# Add lag features, remove missing values, and scale the features
lag_periods = 10  # Number of lag periods for the lag features

# Add lag features
for i in range(1, lag_periods + 1):
    df.loc[:, f'lag_{i}'] = df['Close'].shift(i).values.copy()

# Remove rows with missing values
df.dropna(inplace=True)


# Perform Data Visualise

In [3]:
# Define your independent variables (features)
X = df.drop(['Close','Adj Close'], axis=1)
y = df['Close']

# Perform feature selection by selecting features with a correlation coefficient of at least 0.5 with the target variable
corr = X.corrwith(y)
corr_threshold = 0.5
selected_features = corr[abs(corr) > corr_threshold].index.tolist()
X = X[selected_features]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10, shuffle=False)

# Preprocess the data by scaling it
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape the input features to fit the LSTM model
X_train_reshaped = np.reshape(X_train_scaled, (X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_reshaped = np.reshape(X_test_scaled, (X_test_scaled.shape[0], X_test_scaled.shape[1], 1))


  corr = X.corrwith(y)


In [12]:
# Build the LSTM model
model = Sequential()

model.add(LSTM(100, return_sequences=True, input_shape=(X_train_scaled.shape[1], 1)))
model.add(Dropout(0.2))  # Add a dropout layer with a dropout rate of 0.2
model.add(LSTM(100))
model.add(Dropout(0.2))  # Add a dropout layer with a dropout rate of 0.2
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')


In [13]:
# Train the LSTM model
lstm = model.fit(X_train_reshaped, y_train, epochs=600, batch_size=32)

Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78

In [15]:
# Evaluate the model on the training data
y_train_pred = model.predict(X_train_reshaped)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print("Train Set:")
print("Mean Squared Error (MSE):", mse_train)
print("Mean Absolute Error (MAE):", mae_train)
print("R-squared (R²):", r2_train)

# Evaluate the model on the testing data
y_test_pred = model.predict(X_test_reshaped)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print("Test Set:")
print("Mean Squared Error (MSE):", mse_test)
print("Mean Absolute Error (MAE):", mae_test)
print("R-squared (R²):", r2_test)

# Get the loss from the model history
print(lstm.history['loss'][-10:])

Train Set:
Mean Squared Error (MSE): 4.023882564778769
Mean Absolute Error (MAE): 1.595927735530969
R-squared (R²): 0.9945306537622001
Test Set:
Mean Squared Error (MSE): 3.3950195389902076
Mean Absolute Error (MAE): 1.4619156808564158
R-squared (R²): 0.9570159207280234
[36.578189849853516, 35.003047943115234, 32.56462097167969, 32.583335876464844, 38.01358413696289, 29.799413681030273, 36.36939239501953, 31.798667907714844, 36.428550720214844, 37.564815521240234]


In [7]:
# Visualise Prediction
