In [5]:
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import os
import joblib

# Fetch stock data
stock = "GOOG"
end_date = pd.Timestamp.now()
start_date = end_date - pd.DateOffset(years=10)
data = yf.download(stock, start=start_date, end=end_date)

# Prepare data for training and testing
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data[["Close"]])
window_size = 100

# Split data into training and testing sets
train_size = int(len(scaled_data) * 0.7)
train_data = scaled_data[:train_size]
test_data = scaled_data[train_size:]

# Prepare LSTM training data
x_train, y_train = [], []
for i in range(window_size, len(train_data)):
    x_train.append(train_data[i - window_size:i])
    y_train.append(train_data[i])

x_train, y_train = np.array(x_train), np.array(y_train)

# Prepare Linear Regression training data
x_train_lr = np.arange(0, len(train_data)).reshape(-1, 1)  # Time as feature
y_train_lr = train_data.reshape(-1, 1)

# Normalize features for Linear Regression
def normalize_features(x):
    return (x - np.mean(x)) / np.std(x)

x_train_lr = normalize_features(x_train_lr)

# Initialize model parameters
def initialize_params(n):
    return np.zeros((n, 1)), 0  # weights and bias (theta)

# Compute the cost function
def compute_cost(X, y, w, b):
    m = len(X)
    predictions = X.dot(w) + b
    cost = (1 / (2 * m)) * np.sum((predictions - y) ** 2)
    return cost

# Perform Gradient Descent
def gradient_descent(X, y, w, b, learning_rate, iterations):
    m = len(X)
    cost_history = []
    
    for i in range(iterations):
        predictions = X.dot(w) + b
        dw = (1 / m) * X.T.dot(predictions - y)
        db = (1 / m) * np.sum(predictions - y)
        
        w = w - learning_rate * dw
        b = b - learning_rate * db
        
        cost = compute_cost(X, y, w, b)
        cost_history.append(cost)
        
        if i % 100 == 0:
            print(f"Iteration {i}: Cost = {cost}")
    
    return w, b, cost_history

# Train the Linear Regression model using Gradient Descent
w, b, cost_history = gradient_descent(x_train_lr, y_train_lr, np.zeros((x_train_lr.shape[1], 1)), 0, 0.01, 1000)

# Save the regression model parameters (weights and bias)
np.save("saved_models/linear_regression_weights.npy", w)
np.save("saved_models/linear_regression_bias.npy", b)

# Save the scaler for future use
if not os.path.exists("saved_models"):
    os.makedirs("saved_models")
np.save("saved_models/scaler_minmax.npy", scaler.min_)
np.save("saved_models/scaler_scale.npy", scaler.scale_)

# Build and train the LSTM model (same as before)


lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(units=50, return_sequences=True))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(units=1))

lstm_model.compile(optimizer="adam", loss="mean_squared_error")
lstm_model.fit(x_train, y_train, epochs=20, batch_size=32)

# Save the LSTM model
lstm_model.save("saved_models/pretrained_stock_model.h5")
print("Models saved successfully.")

# Evaluate both models on test data
# LSTM evaluation
x_test, y_test = [], []
for i in range(window_size, len(test_data)):
    x_test.append(test_data[i - window_size:i])
    y_test.append(test_data[i])

x_test, y_test = np.array(x_test), np.array(y_test)
lstm_predictions = lstm_model.predict(x_test)
inv_lstm_predictions = scaler.inverse_transform(lstm_predictions)
inv_y_test = scaler.inverse_transform(y_test)

# Linear Regression evaluation
x_test_lr = np.arange(train_size, len(scaled_data)).reshape(-1, 1)
x_test_lr = normalize_features(x_test_lr)  # Normalize the test features for Linear Regression
lr_predictions = x_test_lr.dot(w) + b
inv_lr_predictions = scaler.inverse_transform(lr_predictions)

# Calculate errors
from sklearn.metrics import mean_squared_error

lstm_rmse = np.sqrt(mean_squared_error(inv_y_test, inv_lstm_predictions))
lr_rmse = np.sqrt(mean_squared_error(data["Close"][train_size:].values, inv_lr_predictions))

print(f"LSTM RMSE: {lstm_rmse}")
print(f"Linear Regression RMSE: {lr_rmse}")




[*********************100%%**********************]  1 of 1 completed


Iteration 0: Cost = 0.035481985276189544
Iteration 100: Cost = 0.007523320556476878
Iteration 200: Cost = 0.0037774277478670026
Iteration 300: Cost = 0.003275554247316674
Iteration 400: Cost = 0.0032083133988931125
Iteration 500: Cost = 0.0031993044918841515
Iteration 600: Cost = 0.003198097481452265
Iteration 700: Cost = 0.00319793576658705
Iteration 800: Cost = 0.003197914100081989
Iteration 900: Cost = 0.003197911197210686
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


  saving_api.save_model(


Models saved successfully.
LSTM RMSE: 6.134489141148141
Linear Regression RMSE: 76.13598226504504
