#  Automated Grid Search (With 1000 Trials)

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf 
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from keras_tuner.tuners import RandomSearch
from sklearn.model_selection import train_test_split, GridSearchCV

tk = 'AAPL'
ticker = yf.Ticker(tk)
df = ticker.history(interval='1d', start='2022-01-01')

df = df['Close'].values
df = df.reshape(-1, 1)

dataset_train = np.array(df[:int(df.shape[0]*0.8)])
dataset_test = np.array(df[int(df.shape[0]*0.8):])

scaler = MinMaxScaler(feature_range=(0,1))
dataset_train = scaler.fit_transform(dataset_train)
dataset_test = scaler.transform(dataset_test)

def create_dataset(df):
    x = []
    y = []
    for i in range(50, df.shape[0]):
        x.append(df[i-50:i, 0])
        y.append(df[i, 0])
    x = np.array(x)
    y = np.array(y)
    return x,y


x_train, y_train = create_dataset(dataset_train)
x_test, y_test = create_dataset(dataset_test)


x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))


# Defining the function to build the model
def build_lstm_model(hp):
    model = Sequential()
    model.add(LSTM(units=hp.Int('units_layer1', min_value=100, max_value=500, step=50),
                   return_sequences=True, input_shape=(x_train.shape[1], 1)))
    model.add(Dropout(hp.Float('dropout_rate_layer1', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(LSTM(units=hp.Int('units_layer2', min_value=100, max_value=500, step=50),
                   return_sequences=True))
    model.add(Dropout(hp.Float('dropout_rate_layer2', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(LSTM(units=hp.Int('units_layer3', min_value=100, max_value=500, step=50),
                   return_sequences=True))
    model.add(Dropout(hp.Float('dropout_rate_layer3', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(LSTM(units=hp.Int('units_layer4', min_value=100, max_value=500, step=50)))
    model.add(Dropout(hp.Float('dropout_rate_layer4', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(units=1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Instantiating the RandomSearch tuner (Creating tuner object)
tuner = RandomSearch(
    build_lstm_model,
    objective='val_loss',
    max_trials=1500,
    executions_per_trial=1,
    directory='my_dir',  # Specify a directory to save results
    project_name='my_project'
)

# Searching for the best hyperparameters
tuner.search(x_train, y_train, validation_data=(x_test, y_test))

# Inside the loop that trains the models
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best hyperparameters:")
print("- units_layer1:", best_hps.get('units_layer1'))
print("- dropout_rate_layer1:", best_hps.get('dropout_rate_layer1'))
print("- units_layer2:", best_hps.get('units_layer2'))
print("- dropout_rate_layer2:", best_hps.get('dropout_rate_layer2'))
print("- units_layer3:", best_hps.get('units_layer3'))
print("- dropout_rate_layer3:", best_hps.get('dropout_rate_layer3'))
print("- units_layer4:", best_hps.get('units_layer4'))
print("- dropout_rate_layer4:", best_hps.get('dropout_rate_layer4'))

Trial 1500 Complete [00h 00m 46s]
val_loss: 0.02931946888566017

Best val_loss So Far: 0.003012970555573702
Total elapsed time: 00h 10m 09s
INFO:tensorflow:Oracle triggered exit
Best hyperparameters:
- units_layer1: 500
- dropout_rate_layer1: 0.4
- units_layer2: 500
- dropout_rate_layer2: 0.30000000000000004
- units_layer3: 500
- dropout_rate_layer3: 0.2
- units_layer4: 200
- dropout_rate_layer4: 0.4


# Combined Hyperparameter Tuning for multiple stocks

In [2]:
import numpy as np
import yfinance as yf 
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras_tuner.tuners import RandomSearch

# Define a list of stock tickers
stock_tickers = ['AAPL', 'MSFT', 'ORCL']  # Add more tickers as needed

# Create an empty array to store combined data
combined_data = []

# Loop through the stock tickers to fetch and preprocess data
for tk in stock_tickers:
    ticker = yf.Ticker(tk)
    df = ticker.history(interval='1d', start='2022-01-01', end='2023-09-11')
    df = df['Close'].values
    df = df.reshape(-1, 1)
    combined_data.append(df)

combined_data = np.concatenate(combined_data, axis=1)  # Combine data

scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(combined_data)

def create_dataset(df):
    x = []
    y = []
    for i in range(50, df.shape[0]):
        x.append(df[i-50:i, :])
        y.append(df[i, :])
    x = np.array(x)
    y = np.array(y)
    return x, y

x_train, y_train = create_dataset(scaled_data)

x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], x_train.shape[2]))

def build_lstm_model(hp):
    model = Sequential()
    model.add(LSTM(units=hp.Int('units_layer1', min_value=100, max_value=500, step=50),
                   return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
    model.add(Dropout(hp.Float('dropout_rate_layer1', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(LSTM(units=hp.Int('units_layer2', min_value=100, max_value=500, step=50),
                   return_sequences=True))
    model.add(Dropout(hp.Float('dropout_rate_layer2', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(LSTM(units=hp.Int('units_layer3', min_value=100, max_value=500, step=50),
                   return_sequences=True))
    model.add(Dropout(hp.Float('dropout_rate_layer3', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(LSTM(units=hp.Int('units_layer4', min_value=100, max_value=500, step=50)))
    model.add(Dropout(hp.Float('dropout_rate_layer4', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(units=x_train.shape[2]))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

tuner = RandomSearch(
    build_lstm_model,
    objective='val_loss',
    max_trials=8000,
    executions_per_trial=1,
    directory='my_dir_2',  # Specify a directory to save results
    project_name='my_project_2'
)

tuner.search(x_train, y_train, validation_split=0.2)

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best hyperparameters:")
print("- units_layer1:", best_hps.get('units_layer1'))
print("- dropout_rate_layer1:", best_hps.get('dropout_rate_layer1'))
print("- units_layer2:", best_hps.get('units_layer2'))
print("- dropout_rate_layer2:", best_hps.get('dropout_rate_layer2'))
print("- units_layer3:", best_hps.get('units_layer3'))
print("- dropout_rate_layer3:", best_hps.get('dropout_rate_layer3'))
print("- units_layer4:", best_hps.get('units_layer4'))
print("- dropout_rate_layer4:", best_hps.get('dropout_rate_layer4'))

INFO:tensorflow:Reloading Tuner from my_dir_2\my_project_2\tuner0.json


RuntimeError: Error reloading `Oracle` from existing project. If you did not mean to reload from an existing project, change the `project_name` or pass `overwrite=True` when creating the `Tuner`. Found existing project at: my_dir_2\my_project_2

# !!!!!!!!!!!!!!!

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# Define the hyperparameters to search
param_grid = {
    'num_lstm_units': [50, 100, 150],
    'dropout_rate': [0.2, 0.4],
    'batch_size': [32, 64],
    'epochs': [100, 150, 200]
}

# Method for data preprocessing
def preprocess_stock_data(ticker):
    # Reading the data csv
    df_price = pd.read_csv(f"{ticker}_Daily_Data.csv")
    df_news = pd.read_csv(f"{ticker}_News_Content.csv")

    # Convert the 'Date' column to datetime if it's not already
    df_price['Date'] = pd.to_datetime(df_price['Date'])

    # Specify the date from which you want to keep the rows
    start_date = pd.to_datetime('2022-03-01')  # Change this to your desired start date

    # Filter the DataFrame to keep rows from the start_date onwards
    df_price = df_price[df_price['Date'] >= start_date]

    # Reset the index of the filtered DataFrame
    df_price.reset_index(drop=True, inplace=True)

    # Extract date components
    df_news['Date'] = pd.to_datetime(df_news['time_published'], format='%Y%m%dT%H%M%S').dt.date

    # Convert date to datetime format
    df_news['Date'] = pd.to_datetime(df_news['Date'])

    # Define a custom function to determine if a given date is a weekend
    def is_weekend(date):
        return date.weekday() >= 5  # 5 and 6 represent Saturday and Sunday

    # Create a new column 'Date_adjusted' that shifts weekend dates to the next Monday
    df_news['Date'] = df_news['Date'].apply(lambda x: x + pd.DateOffset(days=2) if is_weekend(x) else x)

    # Drop unnecessary columns
    columns_to_drop = ['ticker', 'time_published', 'overall_sentiment_label', 'ticker_sentiment_label']
    df_news.drop(columns=columns_to_drop, inplace=True)

    # Group by 'Date_adjusted' and concatenate titles and summaries into a single paragraph
    agg_functions = {
        'title': ' '.join,  # Concatenate titles
        'summary': ' '.join,  # Concatenate summaries
        'overall_sentiment_score': 'mean',  # Calculate the mean
        'relevance_score': 'mean',  # Calculate the mean
        'ticker_sentiment_score': 'mean'  # Calculate the mean
    }

    df_news = df_news.groupby('Date').agg(agg_functions).reset_index()

    # Rename columns if needed
    df_news.rename(columns={'overall_sentiment_score': 'average_overall_sentiment_score',
                            'relevance_score': 'average_relevance_score',
                            'ticker_sentiment_score': 'average_ticker_sentiment_score'},
                   inplace=True)

    # Define a custom function to apply the condition
    def calculate_sentiment(score):
        if score > 0.5:
            return 1
        else:
            return 0

    # Apply the custom function to create the 'sentiment score' column
    df_news['sentiment_score'] = df_news['average_overall_sentiment_score'].apply(calculate_sentiment)

    # Convert 'Date' column to datetime type in both dataframes
    df_news['Date'] = pd.to_datetime(df_news['Date'])
    df_price['Date'] = pd.to_datetime(df_price['Date'])

    # Perform a left outer join based on the 'Date' column
    merged_df = pd.merge(df_price, df_news, on='Date', how='left')

    # Fill NaN values with 0
    merged_df.fillna(0, inplace=True)

    # Drop duplicate rows
    merged_df.drop_duplicates(inplace=True)

    # Reset the index
    merged_df.reset_index(drop=True, inplace=True)

    return merged_df

# Method to create a dataset for LSTM
def create_dataset(dataset, time_step=1):
    x = []
    y = []
    for i in range(len(dataset) - time_step - 1):
        a = dataset[i:(i + time_step), :]
        x.append(a)
        y.append(dataset[i + time_step, 0])  # Assuming the 'Close' price is in the first column
    x = np.array(x)
    y = np.array(y)
    return x, y

# Method to train the LSTM model with given hyperparameters
def train_lstm_model(x_train, y_train, time_step, close_scaler, volume_scaler, sentiment_scaler,
                     ticker_symbol, num_lstm_units, dropout_rate, batch_size, epochs):
    model = Sequential()
    model.add(LSTM(units=num_lstm_units, return_sequences=True, input_shape=(time_step, 3)))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=num_lstm_units, return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=num_lstm_units, return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=num_lstm_units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=1))
    model.compile(loss='mean_squared_error', optimizer='adam')

    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, validation_split=0.1)

    model_dir = "trained_lstm_models"
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(close_scaler, os.path.join(model_dir, f"{ticker_symbol}_close_scaler.joblib"))
    joblib.dump(volume_scaler, os.path.join(model_dir, f"{ticker_symbol}_volume_scaler.joblib"))
    joblib.dump(sentiment_scaler, os.path.join(model_dir, f"{ticker_symbol}_sentiment_scaler.joblib"))
    model.save(os.path.join(model_dir, f"{ticker_symbol}_stock_prediction_with_volume_sentiment.keras"))

if __name__ == "__main__":
    stock_symbols = ['AAPL']

    for tk in stock_symbols:
        processed_df = preprocess_stock_data(tk)

        time_step = 100
        df_features = processed_df[['Close', 'Volume', 'average_overall_sentiment_score']].values
        x_train, y_train = create_dataset(df_features, time_step)
        x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 3))

        close_scaler = MinMaxScaler(feature_range=(0, 1))
        volume_scaler = MinMaxScaler(feature_range=(0, 1))
        sentiment_scaler = MinMaxScaler(feature_range=(0, 1))

        close_scaler.fit(x_train[:, :, 0])
        volume_scaler.fit(x_train[:, :, 1])
        sentiment_scaler.fit(x_train[:, :, 2])

        x_train[:, :, 0] = close_scaler.transform(x_train[:, :, 0])
        x_train[:, :, 1] = volume_scaler.transform(x_train[:, :, 1])
        x_train[:, :, 2] = sentiment_scaler.transform(x_train[:, :, 2])

        x_train = x_train.astype(np.float32)
        y_train = y_train.astype(np.float32)

        # Initialize the LSTM model
        lstm_model = Sequential()

        # Define the hyperparameter grid
        param_grid = {
            'num_lstm_units': [50, 100, 150],
            'dropout_rate': [0.2, 0.4],
            'batch_size': [32, 64],
            'epochs': [100, 150, 200]
        }

        # Create a grid search
        grid = GridSearchCV(estimator=lstm_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, verbose=2)

        # Fit the grid search to the data
        grid_result = grid.fit(x_train, y_train)

        # Print the best hyperparameters
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

        # Train the final model with the best hyperparameters
        best_params = grid_result.best_params_
        train_lstm_model(x_train, y_train, time_step, close_scaler, volume_scaler, sentiment_scaler, tk,
                         num_lstm_units=best_params['num_lstm_units'],
                         dropout_rate=best_params['dropout_rate'],
                         batch_size=best_params['batch_size'],
                         epochs=best_params['epochs'])
