In [8]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBRegressor
import catboost as cb
from catboost import CatBoostRegressor
import pmdarima as pm
from pmdarima import auto_arima
from arch import arch_model
from sklearn.svm import SVR
from sklearn.linear_model import LassoCV, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import mutual_info_regression, RFE, RFECV
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, ParameterSampler, train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.cluster import AffinityPropagation
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
import tensorflow as tf
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, GRU, Flatten, BatchNormalization, Dropout, Input, SimpleRNN, MaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping
from keras.layers import Input
from scikeras.wrappers import KerasClassifier, KerasRegressor
from statsmodels.tsa.arima.model import ARIMA
import optuna
from optuna.integration import KerasPruningCallback
import random
import pickle
from math import sqrt
from statsmodels.tsa.statespace.sarimax import SARIMAX
import os

# **Preprocessing Data + Data Cleaning**

**Read and Combine dataset**

In [10]:
# Load the two source tables: company attributes and the per-date stock records
company_info_df = pd.read_csv('company_info.csv')
company_stock_details_df = pd.read_csv('company_stock_details.csv')

# Parse 'Date' as day-first; entries that cannot be parsed become NaT instead of raising
parsed_dates = pd.to_datetime(
    company_stock_details_df['Date'],
    dayfirst=True,
    errors='coerce',
)
company_stock_details_df['Date'] = parsed_dates

# Left-join the stock records onto the company table, keyed on the ticker symbol
data_combined = company_info_df.merge(company_stock_details_df, how='left', on='Symbol')

**Handle missing values: drop rows without Date/Close/Volume, replace missing News counts with zero, and remove duplicates**

In [12]:
# Report the number of missing values per column
print(data_combined.isnull().sum())

# Display rows with missing values in "Date", "Close", or "Volume"
required_columns = ['Date', 'Close', 'Volume']
missing_data_rows = data_combined[data_combined[required_columns].isnull().any(axis=1)]
print(missing_data_rows)

# Remove rows with missing values in "Date", "Close", or "Volume"
data_combined = data_combined.dropna(subset=required_columns)

# Replace missing news counts with 0. Only the News columns are filled, so a gap in any other
# column stays visible as NaN instead of being silently turned into a zero. Using a per-column
# dict with a non-inplace fillna returns a new frame and keeps the cell re-runnable.
news_columns = [col for col in data_combined.columns if 'news' in col.lower()]
data_combined = data_combined.fillna({col: 0 for col in news_columns})

# Verify that no missing values remain
print(data_combined.isnull().sum())

# Dropping duplicate rows
data_combined = data_combined.drop_duplicates()

Symbol                             0
GICS Sector                        0
Headquarters Location              0
Founded                            0
Date                               2
Close                              2
Volume                             2
News - Positive Sentiment        522
News - Negative Sentiment        522
News - New Products              522
News - Layoffs                   522
News - Analyst Comments          522
News - Stocks                    522
News - Dividends                 522
News - Corporate Earnings        522
News - Mergers & Acquisitions    522
News - Store Openings            522
News - Product Recalls           522
News - Adverse Events            522
News - Personnel Changes         522
News - Stock Rumors              522
dtype: int64
      Symbol       GICS Sector Headquarters Location Founded Date  Close  \
29988  BRK.B        Financials       Omaha, Nebraska    1839  NaT    NaN   
37045   BF.B  Consumer Staples  Louisville, Kentucky    18

**Split into Train Data and Test Data**

In [14]:
# Collect the distinct dates in chronological order. The frame comes from a merge keyed on
# 'Symbol', so its rows are not ordered by date; the unique values must be sorted explicitly,
# otherwise the "80% position" would depend on row order rather than on time.
unique_dates = data_combined['Date'].drop_duplicates().sort_values().reset_index(drop=True)

# Get the date at the 80% position of the sorted distinct dates
cutoff_date = unique_dates.iloc[int(0.8 * len(unique_dates))]
print(cutoff_date)

# Split the data into train and test sets based on the cutoff_date.
# .copy() gives each split its own data so later column assignments do not target a view.
train_data = data_combined[data_combined['Date'] <= cutoff_date].copy()
test_data = data_combined[data_combined['Date'] > cutoff_date].copy()

# Print the shapes of the train and test datasets
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

2022-02-23 00:00:00
Train data shape: (174251, 21)
Test data shape: (43560, 21)


In [16]:
# Build the RNN/LSTM inputs: drop every row that still contains a missing value,
# then write each split to its own CSV file (without the index column)
train_data_lstm, test_data_lstm = [split.dropna() for split in (train_data, test_data)]
lstm_exports = (
    (train_data_lstm, 'train_data_lstm.csv'),
    (test_data_lstm, 'test_data_lstm.csv'),
)
for lstm_frame, csv_path in lstm_exports:
    lstm_frame.to_csv(csv_path, index=False)

**Create dataset with lag value**

In [18]:
def add_lag_features(frame, columns, n_lags=5):
    """Return a copy of `frame` with per-symbol lagged copies of `columns` appended.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'Symbol' column and be ordered chronologically within each symbol.
    columns : list of str
        Columns to lag.
    n_lags : int, default 5
        Lags 1..n_lags are created, named '<column>_lag<k>'.

    Returns
    -------
    pandas.DataFrame
        `frame` plus the lag columns. The first `n_lags` rows of every symbol hold NaN in
        the lag columns because they have no earlier rows of that symbol to draw from.
    """
    grouped = frame.groupby('Symbol')
    # Same column order as a nested "for lag / for col" loop: all lag1 columns, then lag2, ...
    lagged = {
        f'{col}_lag{lag}': grouped[col].shift(lag)
        for lag in range(1, n_lags + 1)
        for col in columns
    }
    # A single concat avoids inserting dozens of columns one by one (frame fragmentation)
    return pd.concat([frame, pd.DataFrame(lagged)], axis=1)


# Sort by 'Symbol' and then by date to maintain chronological order for each stock
train_data = train_data.sort_values(by=['Symbol', 'Date'])
test_data = test_data.sort_values(by=['Symbol', 'Date'])

# Create lag features for the past 5 days for 'Close', 'Volume' and every news column
exclude_columns = ['Symbol', 'GICS Sector', 'Headquarters Location', 'Founded', 'Date']
lag_columns = [col for col in train_data.columns if col not in exclude_columns and ("news" in col.lower() or col in ['Close', 'Volume'])]

# Both splits are lagged within each symbol. Shifting the test frame without grouping would
# fill the first rows of every company with the last rows of the previous company.
train_data = add_lag_features(train_data, lag_columns, n_lags=5)
test_data = add_lag_features(test_data, lag_columns, n_lags=5)

# Drop rows with NaN values generated due to lagging
train_data = train_data.dropna()
test_data = test_data.dropna()

# Save the new train_data and test_data to csv files
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

# Filter columns to keep only 'Close' and columns that contain 'lag'
filtered_data = train_data[[col for col in train_data.columns if col == 'Close' or 'lag' in col]]
filtered_data.to_csv('filtered_data.csv', index=False)
filtered_test = test_data[[col for col in test_data.columns if col == 'Close' or 'lag' in col]]
filtered_test.to_csv('filtered_test.csv', index=False)

# **Feature Selection**

In [None]:
# Define the target and features
target_column = 'Close'
feature_columns = [col for col in train_data.columns if 'lag' in col]

X = train_data[feature_columns]
y = train_data[target_column]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Number of top-ranked features kept from each ranking method
top_k = 20

# Step 1: Feature ranking using L1-LR (Lasso) - importance is the absolute coefficient
lasso = Lasso(alpha=0.1, random_state=42)
lasso.fit(X_scaled, y)
lasso_importance = np.abs(lasso.coef_)
lasso_top_features = [feature_columns[i] for i in np.argsort(lasso_importance)[-top_k:]]

# Step 2: Feature ranking using SVM (SVR) - importance is the absolute linear-kernel coefficient
# NOTE(review): kernel SVR scales poorly with the number of rows; if this step dominates the
# runtime, consider fitting on a subsample or switching to LinearSVR - confirm before changing,
# since that would alter the ranking.
svr = SVR(kernel='linear', C=1.0)
svr.fit(X_scaled, y)
svr_importance = np.abs(svr.coef_.ravel())
svr_top_features = [feature_columns[i] for i in np.argsort(svr_importance)[-top_k:]]

# Step 3: Feature ranking using Random Forest impurity importances
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs = -1)
rf.fit(X_scaled, y)
rf_importance = rf.feature_importances_
rf_top_features = [feature_columns[i] for i in np.argsort(rf_importance)[-top_k:]]

# Combine the top features from each method (duplicates removed, first occurrence kept)
selected_features = pd.unique(np.concatenate((lasso_top_features, svr_top_features, rf_top_features)))

# Step 4: Clustering using Affinity Propagation on selected features
X_selected = train_data[selected_features]

# Calculate linear correlation matrix
correlation_matrix = X_selected.corr().values

# Calculate information gain matrix: entry [i, j] is the mutual information between feature i
# (as predictor) and feature j (as target). One call per target column scores every predictor
# at once, which replaces one estimator call per (i, j) pair. random_state makes the estimate
# repeatable across runs.
n_selected = len(selected_features)
information_gain_matrix = np.zeros((n_selected, n_selected))
for j, target_feature in enumerate(selected_features):
    information_gain_matrix[:, j] = mutual_info_regression(
        X_selected, X_selected[target_feature], random_state=42
    )
# The diagonal (a feature against itself) is kept at zero
np.fill_diagonal(information_gain_matrix, 0.0)

# Combine correlation and information gain matrices
combined_similarity_matrix = (np.abs(correlation_matrix) + information_gain_matrix) / 2

# Affinity Propagation clustering
affinity_propagation = AffinityPropagation(affinity='precomputed', random_state=42)
clusters = affinity_propagation.fit_predict(combined_similarity_matrix)

# Assign clusters to features
feature_importances = pd.DataFrame({'Feature': selected_features})
feature_importances['Cluster'] = clusters

# Select the exemplar feature of each cluster. cluster_centers_indices_ holds the index of the
# exemplar chosen by Affinity Propagation for each cluster label, which is not necessarily the
# first member of the cluster.
exemplar_indices = affinity_propagation.cluster_centers_indices_
if len(exemplar_indices) > 0:
    exemplar_features = [selected_features[idx] for idx in exemplar_indices]
else:
    # No exemplars were found (the algorithm did not converge): fall back to the first
    # feature listed under each label so downstream code still receives a non-empty list.
    exemplar_features = [
        feature_importances.loc[feature_importances['Cluster'] == cluster_id, 'Feature'].iloc[0]
        for cluster_id in np.unique(clusters)
    ]

# Top features based on Affinity Propagation
mffs_top_features = exemplar_features

In [None]:
# Multi-Filters Neural Network (MFNN) used as the teacher model; it has three parallel paths
class MFNN(tf.keras.Model):
    """Teacher network that merges a one-conv path, a two-conv path and a GRU path.

    Each path receives the same input and its output is averaged over axis 1 before the
    three results are concatenated and passed through Dense(128) -> Dropout(0.5) -> Dense(2).

    Parameters
    ----------
    input_shape
        Accepted by the constructor but not used when building the layers.
    """

    def __init__(self, input_shape):
        super().__init__()

        # Path 1: one convolution
        self.single_conv = Conv1D(64, kernel_size=3, activation='relu', padding='same')

        # Path 2: two stacked convolutions followed by batch normalisation
        self.double_conv1 = Conv1D(64, kernel_size=3, activation='relu', padding='same')
        self.double_conv2 = Conv1D(64, kernel_size=3, activation='relu', padding='same')
        self.batch_norm_conv = BatchNormalization()

        # Path 3: GRU returning the full sequence, followed by batch normalisation
        self.gru = GRU(64, return_sequences=True)
        self.batch_norm_rnn = BatchNormalization()

        # Prediction head
        self.flatten = Flatten()
        self.fc1 = Dense(128, activation='relu')
        self.dropout = Dropout(0.5)
        self.fc2 = Dense(2)

    def call(self, x):
        """Run the three paths on `x`, pool and merge them, and return the 2-unit output."""
        # Path 1
        conv_single_out = self.single_conv(x)

        # Path 2
        conv_double_out = self.batch_norm_conv(self.double_conv2(self.double_conv1(x)))

        # Path 3
        recurrent_out = self.batch_norm_rnn(self.gru(x))

        # Average every path over axis 1, then join the pooled outputs side by side
        pooled_paths = [
            tf.reduce_mean(path_out, axis=1)
            for path_out in (conv_single_out, conv_double_out, recurrent_out)
        ]
        merged = tf.concat(pooled_paths, axis=1)

        # Prediction head
        hidden = self.fc1(self.flatten(merged))
        return self.fc2(self.dropout(hidden))

# Reshape input data for the MFNN: insert a length-1 axis so the array is 3D
# (samples, 1, features). X, y and X_scaled come from the feature-selection cell above.
X_scaled_expanded = np.expand_dims(X_scaled, axis=1)

# Initialize and compile the MFNN model
teacher_model = MFNN(input_shape=(X_scaled_expanded.shape[1], X_scaled_expanded.shape[2]))
teacher_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train the teacher model on the full feature set to learn the representation
teacher_model.fit(X_scaled_expanded, y, epochs=50, batch_size=32, verbose=1)

# Use the MFNN model to generate 2D feature representations for the student
X_teacher_representation = teacher_model.predict(X_scaled_expanded)

# Define the student network to learn the MFNN feature representation.
# The input shape is declared with an explicit Input layer; passing input_shape= to Dense
# is deprecated and triggers a UserWarning in current Keras.
student_model = Sequential([
    Input(shape=(X.shape[1],)),
    Dense(20, activation='relu'),
    Dense(2)
])

# Compile the student model
student_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train the student model to learn from the MFNN representation
student_model.fit(X_scaled, X_teacher_representation, epochs=100, batch_size=32, verbose=1)

# Extract feature importance from the student model's first Dense layer: the mean absolute
# weight of each input row. No row-sparsity constraint or regulariser is applied during
# training; the ranking is derived from the plain trained weights.
first_dense_layer = next(layer for layer in student_model.layers if isinstance(layer, Dense))
W1 = first_dense_layer.get_weights()[0]
feature_importance = np.mean(np.abs(W1), axis=1)

# Select the top 20 features based on feature importance scores.
# Note: this rebinds `selected_features`, replacing the value set by the previous cell.
top_features_idx = np.argsort(feature_importance)[-20:]
selected_features = X.columns[top_features_idx]

print("Top selected features:", selected_features)

Epoch 1/50
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 6ms/step - loss: 11844.9727
Epoch 2/50
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 6ms/step - loss: 281.0568
Epoch 3/50
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 5ms/step - loss: 267.0135
Epoch 4/50
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 6ms/step - loss: 218.5582
Epoch 5/50
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 6ms/step - loss: 169.9824
Epoch 6/50
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 7ms/step - loss: 216.9709
Epoch 7/50
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 7ms/step - loss: 186.6322
Epoch 8/50
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 7ms/step - loss: 145.0881
Epoch 9/50
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 7ms/step - loss: 131.5930
Epoch 10/50
[1m5368/5368[0m [32m━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 79447.8594
Epoch 2/100
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - loss: 2191.8862
Epoch 3/100
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 531.6158
Epoch 4/100
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 191.3537
Epoch 5/100
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 99.4176
Epoch 6/100
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 69.6757
Epoch 7/100
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - loss: 56.8737
Epoch 8/100
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 50.4129
Epoch 9/100
[1m5368/5368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 45.1525
Epoch 10/100
[1m5368/5368[0m [32m━━━━━━

# **Hyperparameter Tuning**

**ANN using Random Grid Search**

In [None]:
# Define the target and features
target_column = 'Close'
feature_columns = [col for col in train_data.columns if 'lag' in col]

# TimeSeriesSplit assumes the rows are in chronological order, but train_data is sorted by
# Symbol first. Re-sort by Date so each validation fold is not earlier than its training fold.
train_chronological = train_data.sort_values('Date', kind='stable')
X_train = train_chronological[feature_columns]
y_train = train_chronological[target_column]

# Standardize the features
# NOTE(review): the scaler is fit on all training rows, including rows later used as
# validation folds; fit it per fold if strictly leak-free validation scores are needed.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Convert numpy arrays to DataFrames
X = pd.DataFrame(X_train_scaled, columns=X_train.columns)
y = y_train

# Expanding window time series validation and hyperparameter tuning
n_splits = 4
tscv = TimeSeriesSplit(n_splits=n_splits)

# Define the model function
def create_model(layers=3, neurons=16, activation='relu', learning_rate=0.005, dropout_rate=0.1, optimizer='adam'):
    """Build and compile a feed-forward regressor.

    The network stacks `layers` blocks of Dense(`neurons`, `activation`) + Dropout(`dropout_rate`)
    and ends with a single linear output unit. `optimizer` may be 'adam' or 'sgd' (built with
    `learning_rate`); any other value is passed to `compile` unchanged.
    """
    model = Sequential()
    for _ in range(layers):
        model.add(Dense(neurons, activation=activation))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1))

    if optimizer == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer == 'sgd':
        optimizer = SGD(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='mse', metrics=['mse'])
    return model

# Define the parameter grid
param_grid = {
    'layers': [3, 5, 7],
    'neurons': [16, 32, 64],
    'activation': ['relu', 'sigmoid'],
    'learning_rate': [0.005, 0.01],
    'dropout_rate': [0.1, 0.5],
    'optimizer': ['adam', 'sgd'],
    'batch_size': [32, 64],
    'epochs': [20, 50]
}

# Sample 20 random combinations. random.seed only seeds Python's generator, so the
# Python/NumPy/TensorFlow generators are seeded as well to make weight initialisation,
# dropout and shuffling repeatable.
random.seed(42)
tf.keras.utils.set_random_seed(42)
param_combinations = list(ParameterSampler(param_grid, n_iter=20, random_state=42))

results = []

# Evaluate each combination
for params in param_combinations:
    # Read the training settings without mutating the sampled dict, so this loop can be
    # re-run without a KeyError
    epochs = params['epochs']
    batch_size = params['batch_size']
    model_params = {name: value for name, value in params.items() if name not in ('epochs', 'batch_size')}

    # Validation MSE of each split
    mse_list = []
    diverged = False

    for train_index, val_index in tscv.split(X):
        # Fold-local names keep the full X_train / y_train defined above intact
        X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
        y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[val_index]

        # Release the graph/state of the previous model before building a new one
        tf.keras.backend.clear_session()

        # Create and train the model
        model = create_model(**model_params)
        model.fit(X_fold_train, y_fold_train, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predict and calculate MSE
        y_pred = model.predict(X_fold_val).flatten()

        # A NaN prediction means training diverged for this configuration. Averaging only the
        # surviving folds would favour such unstable configurations, so drop the combination.
        if np.isnan(y_fold_val).any() or np.isnan(y_pred).any():
            diverged = True
            break

        mse = mean_squared_error(y_fold_val, y_pred)
        mse_list.append(mse)

    # Rank the combination only if every split produced a valid score
    if mse_list and not diverged:
        avg_mse = np.mean(mse_list)
        results.append((avg_mse, {**model_params, 'epochs': epochs, 'batch_size': batch_size}))

# Sort results by average MSE and get the top 3 combinations
results.sort(key=lambda x: x[0])
top_3 = results[:3]

# Display the top 3 combinations with the lowest average MSE
print("Top 3 hyperparameter combinations based on average MSE:")
for i, (avg_mse, params) in enumerate(top_3, start=1):
    print(f"Rank {i}: MSE = {avg_mse:.4f}, Parameters = {params}")


**ANN using Optuna**

In [None]:
# Define the target and features
target_column = 'Close'
feature_columns = [col for col in train_data.columns if 'lag' in col]

# TimeSeriesSplit assumes the rows are in chronological order, but train_data is sorted by
# Symbol first. Re-sort by Date so each validation fold is not earlier than its training fold.
train_chronological = train_data.sort_values('Date', kind='stable')
X_train = train_chronological[feature_columns]
y_train = train_chronological[target_column]

# Standardize the features
# NOTE(review): the scaler is fit on all training rows, including rows later used as
# validation folds; fit it per fold if strictly leak-free validation scores are needed.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Convert numpy arrays to DataFrames
X = pd.DataFrame(X_train_scaled, columns=X_train.columns)
y = y_train

# Expanding window time series validation and hyperparameter tuning
n_splits = 4
tscv = TimeSeriesSplit(n_splits=n_splits)

# Define the model function with parameters as trial suggestions
def create_model(trial):
    """Build and compile a feed-forward regressor from the hyperparameters suggested by `trial`.

    Suggested: number of hidden layers (1-7), neurons per layer, activation, learning rate
    (log scale, 1e-5 to 1e-1), dropout rate (0.1 to 0.5) and optimizer ('adam' or 'sgd').
    """
    layers = trial.suggest_int("layers", 1, 7)
    neurons = trial.suggest_categorical("neurons", [16, 32, 64])
    activation = trial.suggest_categorical("activation", ["relu", "sigmoid"])
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    optimizer_name = trial.suggest_categorical("optimizer", ["adam", "sgd"])

    model = Sequential()
    for _ in range(layers):
        model.add(Dense(neurons, activation=activation))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1))

    if optimizer_name == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_name == 'sgd':
        optimizer = SGD(learning_rate=learning_rate)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")

    model.compile(optimizer=optimizer, loss='mse', metrics=['mse'])
    return model

# Objective function to minimize MSE using Optuna
def objective(trial):
    """Return the mean validation MSE over the expanding-window splits for one trial.

    Returns float('inf') when any split yields NaN targets or predictions, so that a
    configuration whose training diverged cannot be ranked on its surviving splits only.
    """
    # Extract hyperparameters for batch_size and epochs
    batch_size = trial.suggest_categorical("batch_size", [32, 64])
    epochs = trial.suggest_int("epochs", 20, 50)

    mse_list = []

    for train_index, val_index in tscv.split(X):
        X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
        y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[val_index]

        # Release the graph/state of the previous model before building a new one
        tf.keras.backend.clear_session()

        # Create and train the model
        model = create_model(trial)
        model.fit(X_fold_train, y_fold_train, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predict and calculate MSE
        y_pred = model.predict(X_fold_val).flatten()

        # Training diverged: report the worst possible score for this trial
        if np.isnan(y_fold_val).any() or np.isnan(y_pred).any():
            return float("inf")

        mse = mean_squared_error(y_fold_val, y_pred)
        mse_list.append(mse)

    # Return the mean MSE across splits, or a high MSE if no valid splits
    return np.mean(mse_list) if mse_list else float("inf")

# Run Optuna study; the sampler is seeded so the sequence of suggested values is repeatable
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=20)

# Display the best parameters
print("Best hyperparameters:", study.best_params)


**GRU using Random Grid Search**

In [None]:
# Define target column and the non-numeric columns that are never used as features
target_col = 'Close'
non_numeric_cols = ['Symbol', 'GICS Sector', 'Headquarters Location', 'Founded', 'Date']

# Build the modelling frame from train_data. filtered_data only holds 'Close' and the lag
# columns, so it has no 'Symbol' column to group on; 'Symbol' and 'Date' are taken from
# train_data, which filtered_data was derived from. The copy keeps the scaling below from
# modifying train_data / filtered_data, so re-running this cell (or a later cell that scales
# again) always starts from unscaled values.
data = train_data[['Symbol', 'Date'] + list(filtered_data.columns)].copy()
feature_cols = [col for col in data.columns if col not in non_numeric_cols and col != target_col]

# Initialize scalers for features and target
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Scale features and target column
# NOTE(review): both scalers are fit on all rows, including rows later used for validation.
data[feature_cols] = scaler_X.fit_transform(data[feature_cols])
data[target_col] = scaler_y.fit_transform(data[[target_col]]).ravel()

# Prepare data using previous 5 days' features only, excluding today's data
sequence_length = 5
X, y, target_dates = [], [], []

for _, company_data in data.groupby('Symbol', sort=False):
    # Convert once per company; slicing NumPy arrays in the inner loop is much cheaper than
    # slicing the DataFrame for every sample
    company_features = company_data[feature_cols].to_numpy()
    company_targets = company_data[target_col].to_numpy()
    company_dates = company_data['Date'].to_numpy()

    for i in range(sequence_length, len(company_data)):
        # Use the previous 5 days as input features, excluding today
        X.append(company_features[i - sequence_length:i])
        # Target is the 'Close' price of the current day
        y.append(company_targets[i])
        target_dates.append(company_dates[i])

X, y = np.array(X), np.array(y)

# TimeSeriesSplit assumes chronological samples, but the sequences were built company by
# company. Order them by the date of their target so validation folds are not earlier than
# their training folds.
chronological_order = np.argsort(np.array(target_dates), kind='stable')
X, y = X[chronological_order], y[chronological_order]

# Expanding window splitter, defined here so the cell does not depend on an earlier cell
tscv = TimeSeriesSplit(n_splits=4)

# Define the GRU model function
def create_gru_model(layers=3, hidden_units=50, dropout_rate=0.2, learning_rate=0.005, optimizer='adam', grad_clip=1):
    """Build and compile a stacked GRU regressor.

    Stacks `layers` GRU layers of `hidden_units` units, each followed by Dropout(`dropout_rate`);
    every GRU except the last returns the full sequence. A single-unit Dense layer produces the
    output. `optimizer` must be 'adam' or 'rmsprop'; gradients are clipped element-wise to
    `grad_clip` via `clipvalue`.

    Raises
    ------
    ValueError
        If `optimizer` is not one of the supported names.
    """
    model = Sequential()
    for layer in range(layers):
        return_sequences = layer < (layers - 1)
        model.add(GRU(units=hidden_units, return_sequences=return_sequences))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1))  # Output layer

    if optimizer == 'adam':
        optimizer_instance = Adam(learning_rate=learning_rate, clipvalue=grad_clip)
    elif optimizer == 'rmsprop':
        optimizer_instance = RMSprop(learning_rate=learning_rate, clipvalue=grad_clip)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer!r}")

    model.compile(optimizer=optimizer_instance, loss='mean_squared_error')
    return model

# Define the parameter grid
param_grid = {
    'layers': [1, 2, 3],
    'hidden_units': [75, 100],
    'learning_rate': [0.005, 0.01, 0.02],
    'dropout_rate': [0.2, 0.4],
    'optimizer': ['adam', 'rmsprop'],
    'batch_size': [32, 64],
    'epochs': [10, 50],
    'grad_clip': [1, 5]
}

# Sample 20 random combinations. random.seed only seeds Python's generator, so the
# Python/NumPy/TensorFlow generators are seeded as well.
random.seed(42)
tf.keras.utils.set_random_seed(42)
param_combinations = list(ParameterSampler(param_grid, n_iter=20, random_state=42))

results = []

# Evaluate each combination
for params in param_combinations:
    # Read the training settings without mutating the sampled dict, so this loop can be re-run
    epochs = params['epochs']
    batch_size = params['batch_size']
    model_params = {name: value for name, value in params.items() if name not in ('epochs', 'batch_size')}

    # Validation MSE of each split
    mse_list = []
    diverged = False

    for train_index, val_index in tscv.split(X):
        X_fold_train, X_fold_val = X[train_index], X[val_index]
        y_fold_train, y_fold_val = y[train_index], y[val_index]

        # Release the graph/state of the previous model before building a new one
        tf.keras.backend.clear_session()

        # Create and train the model
        model = create_gru_model(**model_params)
        model.fit(X_fold_train, y_fold_train, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predict and calculate MSE
        y_pred = model.predict(X_fold_val).flatten()

        # Reverse scaling for validation and prediction
        y_val_true = scaler_y.inverse_transform(y_fold_val.reshape(-1, 1))
        y_pred = scaler_y.inverse_transform(y_pred.reshape(-1, 1))

        # A NaN prediction means training diverged; drop the whole combination instead of
        # averaging only the surviving folds
        if np.isnan(y_val_true).any() or np.isnan(y_pred).any():
            diverged = True
            break

        mse = mean_squared_error(y_val_true, y_pred)
        mse_list.append(mse)

    # Rank the combination only if every split produced a valid score
    if mse_list and not diverged:
        avg_mse = np.mean(mse_list)
        results.append((avg_mse, {**model_params, 'epochs': epochs, 'batch_size': batch_size}))

# Sort results by average MSE and get the top 3 combinations
results.sort(key=lambda x: x[0])
top_3 = results[:3]

# Display the top 3 combinations with the lowest average MSE
print("Top 3 hyperparameter combinations based on average MSE:")
for i, (avg_mse, params) in enumerate(top_3, start=1):
    print(f"Rank {i}: MSE = {avg_mse:.4f}, Parameters = {params}")


**GRU using Optuna**

In [None]:
# Target column and the non-numeric columns that are never used as features
target_col = 'Close'
non_numeric_cols = ['Symbol', 'GICS Sector', 'Headquarters Location', 'Founded', 'Date']

# Build the modelling frame from train_data. filtered_data only holds 'Close' and the lag
# columns, so it has no 'Symbol' column to group on; 'Symbol' and 'Date' are taken from
# train_data, which filtered_data was derived from. The copy keeps the scaling below from
# modifying train_data / filtered_data, so every run starts from unscaled values.
data = train_data[['Symbol', 'Date'] + list(filtered_data.columns)].copy()
feature_cols = [col for col in data.columns if col not in non_numeric_cols and col != target_col]

# Initialize scaler for features and target
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Scale data
# NOTE(review): both scalers are fit on all rows, including rows later used for validation.
data[feature_cols] = scaler_X.fit_transform(data[feature_cols])
data[target_col] = scaler_y.fit_transform(data[[target_col]]).ravel()

# Define sequence length for GRU
sequence_length = 5
X, y, target_dates = [], [], []

# Prepare data by company, ensuring sequence consistency: each sample is `sequence_length`
# consecutive rows of one company and its target is the next row's 'Close'
for _, company_data in data.groupby('Symbol', sort=False):
    company_features = company_data[feature_cols].to_numpy()
    company_targets = company_data[target_col].to_numpy()
    company_dates = company_data['Date'].to_numpy()
    for i in range(len(company_data) - sequence_length):
        X.append(company_features[i:i + sequence_length])
        y.append(company_targets[i + sequence_length])
        target_dates.append(company_dates[i + sequence_length])

X, y = np.array(X), np.array(y)

# TimeSeriesSplit assumes chronological samples, but the sequences were built company by
# company. Order them by the date of their target so validation folds are not earlier than
# their training folds.
chronological_order = np.argsort(np.array(target_dates), kind='stable')
X, y = X[chronological_order], y[chronological_order]

# Expanding window splitter, defined here so the cell does not depend on an earlier cell
tscv = TimeSeriesSplit(n_splits=4)

# Objective function for Optuna
def objective(trial):
    """Return the mean validation MSE (in original price units) of a GRU for one trial.

    Returns float('inf') if the model produces NaN predictions on any split.
    """
    # Hyperparameter suggestions
    layers = trial.suggest_int('layers', 1, 3)
    hidden_units = trial.suggest_int('hidden_units', 50, 200)
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
    # suggest_float(log=True) replaces the deprecated suggest_loguniform
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, log=True)
    optimizer_name = trial.suggest_categorical('optimizer', ['adam', 'rmsprop'])
    grad_clip = trial.suggest_int('grad_clip', 1, 5)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    epochs = trial.suggest_int('epochs', 10, 50)

    # Initialize list to store MSE for each fold
    mse_scores = []

    # Expanding window cross-validation
    for train_index, val_index in tscv.split(X):
        X_fold_train, X_fold_val = X[train_index], X[val_index]
        y_fold_train, y_fold_val = y[train_index], y[val_index]

        # Build GRU model
        model = Sequential()
        model.add(Input(shape=(X.shape[1], X.shape[2])))

        for i in range(layers):
            return_sequences = i < layers - 1
            # NOTE(review): grad_clip is used both as the MaxNorm weight constraint here and
            # as the gradient clipvalue below - confirm that coupling is intended.
            model.add(GRU(hidden_units, return_sequences=return_sequences, kernel_constraint=MaxNorm(grad_clip)))
            if dropout_rate > 0:
                model.add(Dropout(dropout_rate))

        model.add(Dense(1))

        # Choose optimizer
        if optimizer_name == 'adam':
            optimizer = Adam(learning_rate=learning_rate, clipvalue=grad_clip)
        else:
            optimizer = RMSprop(learning_rate=learning_rate, clipvalue=grad_clip)

        # Compile model
        model.compile(optimizer=optimizer, loss='mse')

        # Early stopping
        # NOTE(review): the fold used for early stopping is the same fold that is scored
        # below, so the reported MSE is optimistic.
        early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

        # Train model on training split
        model.fit(X_fold_train, y_fold_train, epochs=epochs, batch_size=batch_size,
                  validation_data=(X_fold_val, y_fold_val),
                  callbacks=[early_stopping], verbose=0)

        # Predict and calculate MSE for validation data
        y_val_pred_scaled = model.predict(X_fold_val)

        # mean_squared_error raises on NaN, which would abort the whole study; report the
        # worst possible score for a diverged trial instead
        if np.isnan(y_val_pred_scaled).any():
            return float('inf')

        y_val_pred = scaler_y.inverse_transform(y_val_pred_scaled)
        y_val_true = scaler_y.inverse_transform(y_fold_val.reshape(-1, 1))

        mse = mean_squared_error(y_val_true, y_val_pred)
        mse_scores.append(mse)

    # Return the average MSE across all folds
    avg_mse = np.mean(mse_scores)
    return avg_mse

# Run Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, n_jobs=-1)

# Display best hyperparameters
print("Best hyperparameters:", study.best_params)


**RNN using Random Grid Search**

In [None]:
# Define target column and the non-numeric columns that are never used as features
target_col = 'Close'
non_numeric_cols = ['Symbol', 'GICS Sector', 'Headquarters Location', 'Founded', 'Date']

# Build the modelling frame from train_data. filtered_data only holds 'Close' and the lag
# columns, so it has no 'Symbol' column to group on; 'Symbol' and 'Date' are taken from
# train_data, which filtered_data was derived from. The copy keeps the scaling below from
# modifying train_data / filtered_data, so every run starts from unscaled values.
data = train_data[['Symbol', 'Date'] + list(filtered_data.columns)].copy()
feature_cols = [col for col in data.columns if col not in non_numeric_cols and col != target_col]

# Initialize scalers for features and target
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Scale features and target column
# NOTE(review): both scalers are fit on all rows, including rows later used for validation.
data[feature_cols] = scaler_X.fit_transform(data[feature_cols])
data[target_col] = scaler_y.fit_transform(data[[target_col]]).ravel()

# Prepare data using previous 5 days' features only, excluding today's data
sequence_length = 5
X, y, target_dates = [], [], []

for _, company_data in data.groupby('Symbol', sort=False):
    # Convert once per company; slicing NumPy arrays in the inner loop is much cheaper than
    # slicing the DataFrame for every sample
    company_features = company_data[feature_cols].to_numpy()
    company_targets = company_data[target_col].to_numpy()
    company_dates = company_data['Date'].to_numpy()

    for i in range(sequence_length, len(company_data)):
        # Use the previous 5 days as input features, excluding today
        X.append(company_features[i - sequence_length:i])
        # Target is the 'Close' price of the current day
        y.append(company_targets[i])
        target_dates.append(company_dates[i])

X, y = np.array(X), np.array(y)

# TimeSeriesSplit assumes chronological samples, but the sequences were built company by
# company. Order them by the date of their target so validation folds are not earlier than
# their training folds.
chronological_order = np.argsort(np.array(target_dates), kind='stable')
X, y = X[chronological_order], y[chronological_order]

# Expanding window splitter, defined here so the cell does not depend on an earlier cell
tscv = TimeSeriesSplit(n_splits=4)

# Define the RNN model function
def create_rnn_model(layers=3, hidden_units=50, dropout_rate=0.2, learning_rate=0.005, optimizer='adam', grad_clip=1):
    """Build and compile a stacked SimpleRNN regressor.

    Stacks `layers` SimpleRNN layers of `hidden_units` units, each followed by
    Dropout(`dropout_rate`); every recurrent layer except the last returns the full sequence.
    A single-unit Dense layer produces the output. `optimizer` must be 'adam' or 'rmsprop';
    gradients are clipped element-wise to `grad_clip` via `clipvalue`.

    Raises
    ------
    ValueError
        If `optimizer` is not one of the supported names.
    """
    model = Sequential()
    for layer in range(layers):
        return_sequences = layer < (layers - 1)
        model.add(SimpleRNN(units=hidden_units, return_sequences=return_sequences))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1))  # Output layer

    if optimizer == 'adam':
        optimizer_instance = Adam(learning_rate=learning_rate, clipvalue=grad_clip)
    elif optimizer == 'rmsprop':
        optimizer_instance = RMSprop(learning_rate=learning_rate, clipvalue=grad_clip)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer!r}")

    model.compile(optimizer=optimizer_instance, loss='mean_squared_error')
    return model

# Define the parameter grid
param_grid = {
    'layers': [1, 2, 3],
    'hidden_units': [50, 75, 100],
    'learning_rate': [0.005, 0.01],
    'optimizer': ['adam', 'rmsprop'],
    'batch_size': [32, 64],
    'epochs': [20, 50],
    'grad_clip': [1, 5],
    'dropout_rate': [0.1, 0.2]
}

# Sample 20 random combinations. random.seed only seeds Python's generator, so the
# Python/NumPy/TensorFlow generators are seeded as well.
random.seed(42)
tf.keras.utils.set_random_seed(42)
param_combinations = list(ParameterSampler(param_grid, n_iter=20, random_state=42))

results = []

# Evaluate each combination
for params in param_combinations:
    # Read the training settings without mutating the sampled dict, so this loop can be re-run
    epochs = params['epochs']
    batch_size = params['batch_size']
    model_params = {name: value for name, value in params.items() if name not in ('epochs', 'batch_size')}

    # Validation MSE of each split
    mse_list = []
    diverged = False

    for train_index, val_index in tscv.split(X):
        X_fold_train, X_fold_val = X[train_index], X[val_index]
        y_fold_train, y_fold_val = y[train_index], y[val_index]

        # Release the graph/state of the previous model before building a new one
        tf.keras.backend.clear_session()

        # Create and train the model
        model = create_rnn_model(**model_params)
        model.fit(X_fold_train, y_fold_train, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predict and calculate MSE
        y_pred = model.predict(X_fold_val).flatten()

        # Reverse scaling for validation and prediction
        y_val_true = scaler_y.inverse_transform(y_fold_val.reshape(-1, 1))
        y_pred = scaler_y.inverse_transform(y_pred.reshape(-1, 1))

        # A NaN prediction means training diverged; drop the whole combination instead of
        # averaging only the surviving folds
        if np.isnan(y_val_true).any() or np.isnan(y_pred).any():
            diverged = True
            break

        mse = mean_squared_error(y_val_true, y_pred)
        mse_list.append(mse)

    # Rank the combination only if every split produced a valid score
    if mse_list and not diverged:
        avg_mse = np.mean(mse_list)
        results.append((avg_mse, {**model_params, 'epochs': epochs, 'batch_size': batch_size}))

# Sort results by average MSE and get the top 3 combinations
results.sort(key=lambda x: x[0])
top_3 = results[:3]

# Display the top 3 combinations with the lowest average MSE
print("Top 3 hyperparameter combinations based on average MSE:")
for i, (avg_mse, params) in enumerate(top_3, start=1):
    print(f"Rank {i}: MSE = {avg_mse:.4f}, Parameters = {params}")

**RNN using Optuna**

In [None]:
# Target column and the non-numeric columns that are never used as features
target_col = 'Close'
non_numeric_cols = ['Symbol', 'GICS Sector', 'Headquarters Location', 'Founded', 'Date']

# Build the modelling frame from train_data. filtered_data only holds 'Close' and the lag
# columns, so it has no 'Symbol' column to group on; 'Symbol' and 'Date' are taken from
# train_data, which filtered_data was derived from. The copy keeps the scaling below from
# modifying train_data / filtered_data, so every run starts from unscaled values.
data = train_data[['Symbol', 'Date'] + list(filtered_data.columns)].copy()
feature_cols = [col for col in data.columns if col not in non_numeric_cols and col != target_col]

# Initialize scaler for features and target
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Scale data
# NOTE(review): both scalers are fit on all rows, including rows later used for validation.
data[feature_cols] = scaler_X.fit_transform(data[feature_cols])
data[target_col] = scaler_y.fit_transform(data[[target_col]]).ravel()

# Define sequence length for RNN
sequence_length = 5
X, y, target_dates = [], [], []

# Prepare data by company, keeping sequence consistency: each sample is `sequence_length`
# consecutive rows of one company and its target is the next row's 'Close'
for _, company_data in data.groupby('Symbol', sort=False):
    company_features = company_data[feature_cols].to_numpy()
    company_targets = company_data[target_col].to_numpy()
    company_dates = company_data['Date'].to_numpy()
    for i in range(len(company_data) - sequence_length):
        X.append(company_features[i:i + sequence_length])
        y.append(company_targets[i + sequence_length])
        target_dates.append(company_dates[i + sequence_length])

X, y = np.array(X), np.array(y)

# TimeSeriesSplit assumes chronological samples, but the sequences were built company by
# company. Order them by the date of their target so validation folds are not earlier than
# their training folds.
chronological_order = np.argsort(np.array(target_dates), kind='stable')
X, y = X[chronological_order], y[chronological_order]

# Expanding window splitter, defined here so the cell does not depend on an earlier cell
tscv = TimeSeriesSplit(n_splits=4)


# Objective function for Optuna
def objective(trial):
    """Optuna objective: average validation MSE of a stacked SimpleRNN.

    Reads the module-level arrays ``X`` and ``y`` built by this cell and the
    fitted ``scaler_y``. ``X`` is indexed as a 3-D array here (the input shape
    is taken from ``X.shape[1]`` and ``X.shape[2]``).

    Args:
        trial: the ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        Mean MSE over the 4 expanding-window folds, computed after the
        predictions and targets are passed through
        ``scaler_y.inverse_transform``.
    """
    # Hyperparameter suggestions
    layers = trial.suggest_int('layers', 1, 3)
    hidden_units = trial.suggest_int('hidden_units', 50, 150)
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
    # suggest_loguniform is deprecated; suggest_float(..., log=True) is the
    # equivalent call and matches the other Optuna cells in this notebook.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, log=True)
    optimizer_name = trial.suggest_categorical('optimizer', ['adam', 'rmsprop'])
    grad_clip = trial.suggest_int('grad_clip', 1, 5)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    epochs = trial.suggest_int('epochs', 10, 100)

    # Build the splitter locally instead of relying on a global `tscv` that this
    # cell never defines (it would be whatever another cell happened to leave).
    tscv = TimeSeriesSplit(n_splits=4)

    # Initialize list to store MSE for each fold
    mse_scores = []

    # Expanding window cross-validation
    for train_index, val_index in tscv.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Build RNN model
        model = Sequential()
        model.add(Input(shape=(X.shape[1], X.shape[2])))

        for i in range(layers):
            # Every recurrent layer except the last must emit the full sequence
            # so the next recurrent layer receives 3-D input.
            return_sequences = i < layers - 1
            model.add(SimpleRNN(hidden_units, return_sequences=return_sequences, kernel_constraint=MaxNorm(grad_clip)))
            if dropout_rate > 0:
                model.add(Dropout(dropout_rate))

        model.add(Dense(1))

        # Choose optimizer
        if optimizer_name == 'adam':
            optimizer = Adam(learning_rate=learning_rate, clipvalue=grad_clip)
        else:
            optimizer = RMSprop(learning_rate=learning_rate, clipvalue=grad_clip)

        # Compile model
        model.compile(optimizer=optimizer, loss='mse')

        # Early stopping
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

        # Train model on training split.
        # NOTE(review): the validation fold also drives early stopping, so the
        # reported fold MSE is optimistic — confirm this is intended.
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val),
                  callbacks=[early_stopping], verbose=0)

        # Predict and calculate MSE for validation data (verbose=0 keeps the
        # per-fold progress bars out of the notebook output)
        y_val_pred_scaled = model.predict(X_val, verbose=0)
        y_val_pred = scaler_y.inverse_transform(y_val_pred_scaled)
        y_val_true = scaler_y.inverse_transform(y_val.reshape(-1, 1))

        mse = mean_squared_error(y_val_true, y_val_pred)
        mse_scores.append(mse)

    # Return the average MSE across all folds
    avg_mse = np.mean(mse_scores)
    return avg_mse

# Launch the search: a minimisation study, 50 trials, n_jobs=-1
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=50, n_jobs=-1)

# Report the winning configuration
print("Best hyperparameters:", study.best_params)




**LSTM using Random Grid Search**

In [None]:
# Load the data.
# Take a copy: assigning `filtered_data` directly only aliases it, and the
# in-place scaling below would then overwrite the shared frame. Scalers fitted
# afterwards on `filtered_data` would see already-scaled values, so
# `scaler_y.inverse_transform` would stop returning real prices.
data = filtered_data.copy()

# Define target column and exclude non-numeric columns
target_col = 'Close'
non_numeric_cols = ['Symbol', 'GICS Sector', 'Headquarters Location', 'Founded', 'Date']
feature_cols = [col for col in data.columns if col not in non_numeric_cols and col != target_col]

# Initialize scalers for features and target
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Scale features and target column (only the local copy is modified)
data[feature_cols] = scaler_X.fit_transform(data[feature_cols])
data[target_col] = scaler_y.fit_transform(data[[target_col]])

# Prepare data using previous 5 days' features only, excluding today's data
sequence_length = 5
X, y = [], []

for symbol in data['Symbol'].unique():
    # Positional index per company so windows never straddle two symbols
    company_data = data[data['Symbol'] == symbol].reset_index(drop=True)

    for i in range(sequence_length, len(company_data)):
        # Use the previous 5 days as input features, excluding today
        X.append(company_data[feature_cols].iloc[i-sequence_length:i].values)
        # Target is the 'Close' price of the current day
        y.append(company_data[target_col].iloc[i])

X, y = np.array(X), np.array(y)

# Define the model function
def create_lstm_model(layers=3, hidden_units=50, dropout_rate=0.2, learning_rate=0.005, optimizer='adam'):
    """Build and compile a stacked LSTM regressor.

    Args:
        layers: number of LSTM layers; each is followed by a Dropout layer.
        hidden_units: units per LSTM layer.
        dropout_rate: rate passed to every Dropout layer.
        learning_rate: learning rate handed to the optimizer.
        optimizer: ``'adam'`` or ``'rmsprop'``.

    Returns:
        A compiled ``Sequential`` model with a single-unit Dense output and
        mean-squared-error loss.

    Raises:
        ValueError: if ``optimizer`` is not one of the supported names
            (previously this surfaced as an ``UnboundLocalError`` at compile
            time, after the whole network had already been built).
    """
    # Validate first so an unsupported name fails fast with a clear message.
    if optimizer not in ('adam', 'rmsprop'):
        raise ValueError(
            f"Unsupported optimizer {optimizer!r}; expected 'adam' or 'rmsprop'"
        )

    model = Sequential()
    for layer in range(layers):
        # All but the last LSTM layer return sequences so the next LSTM layer
        # receives 3-D input.
        return_sequences = layer < (layers - 1)
        model.add(LSTM(units=hidden_units, return_sequences=return_sequences))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1))  # Output layer

    if optimizer == 'adam':
        optimizer_instance = Adam(learning_rate=learning_rate)
    else:
        optimizer_instance = RMSprop(learning_rate=learning_rate)

    model.compile(optimizer=optimizer_instance, loss='mean_squared_error')
    return model

# Search space for the LSTM random search
param_grid = {
    'layers': [3, 5, 7],
    'hidden_units': [50, 75, 100],
    'learning_rate': [0.005, 0.01],
    'dropout_rate': [0.2, 0.4],
    'optimizer': ['adam', 'rmsprop'],
    'batch_size': [32, 64],
    'epochs': [20, 50],
}

# Draw 20 random combinations from the grid
random.seed(42)
param_combinations = list(ParameterSampler(param_grid, n_iter=20, random_state=42))

# Expanding-window splitter with 4 folds
tscv = TimeSeriesSplit(n_splits=4)

results = []

# Score every sampled combination
for params in param_combinations:
    # epochs / batch_size are fit() arguments, so remove them from the keyword
    # set that is forwarded to create_lstm_model
    epochs, batch_size = params.pop('epochs'), params.pop('batch_size')

    # Per-fold MSE values for this combination
    mse_list = []

    for train_index, val_index in tscv.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_val, y_val = X[val_index], y[val_index]

        # Fresh model for each fold
        model = create_lstm_model(**params)
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predictions and targets, both passed back through scaler_y as column vectors
        y_pred = scaler_y.inverse_transform(model.predict(X_val).flatten().reshape(-1, 1))
        y_val_true = scaler_y.inverse_transform(y_val.reshape(-1, 1))

        # A fold containing NaNs contributes nothing to the average
        has_nan = np.isnan(y_val_true).any() or np.isnan(y_pred).any()
        if not has_nan:
            mse = mean_squared_error(y_val_true, y_pred)
            mse_list.append(mse)

    # Record the combination only when at least one fold produced a score
    if mse_list:
        avg_mse = np.mean(mse_list)
        results.append((avg_mse, dict(params, epochs=epochs, batch_size=batch_size)))

# Rank by average MSE (ascending) and keep the three best
results.sort(key=lambda entry: entry[0])
top_3 = results[:3]

# Show the three best combinations
print("Top 3 hyperparameter combinations based on average MSE:")
for i, (avg_mse, params) in enumerate(top_3, start=1):
    print(f"Rank {i}: MSE = {avg_mse:.4f}, Parameters = {params}")


**LSTM using Optuna**

In [None]:
# Load the data.
# Copy rather than alias: the scaling below writes into `data`, and without the
# copy it would overwrite `filtered_data` itself. Scalers fitted on the shared
# frame afterwards would then see already-scaled values and
# `scaler_y.inverse_transform` would no longer yield real prices.
data = filtered_data.copy()

# Define target column and exclude non-numeric columns
target_col = 'Close'
non_numeric_cols = ['Symbol', 'GICS Sector', 'Headquarters Location', 'Founded', 'Date']
feature_cols = [col for col in data.columns if col not in non_numeric_cols and col != target_col]

# Initialize scalers for features and target
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Scale features and target column (only the local copy is modified)
data[feature_cols] = scaler_X.fit_transform(data[feature_cols])
data[target_col] = scaler_y.fit_transform(data[[target_col]])

# Prepare data using previous 5 days' features only, excluding today's data
sequence_length = 5
X, y = [], []

for symbol in data['Symbol'].unique():
    # Positional index per company so windows never straddle two symbols
    company_data = data[data['Symbol'] == symbol].reset_index(drop=True)

    for i in range(sequence_length, len(company_data)):
        # Use the previous 5 days as input features, excluding today
        X.append(company_data[feature_cols].iloc[i-sequence_length:i].values)
        # Target is the 'Close' price of the current day
        y.append(company_data[target_col].iloc[i])

X, y = np.array(X), np.array(y)

# Define the objective function for Optuna with expanding window validation
def objective(trial):
    """Optuna objective: mean validation MSE of a stacked LSTM.

    Samples the architecture and training hyperparameters from ``trial``,
    trains one model per fold of a 4-split ``TimeSeriesSplit`` over the
    module-level ``X`` / ``y`` arrays, and returns the mean fold MSE computed
    after passing predictions and targets through
    ``scaler_y.inverse_transform``.
    """
    # Sampled hyperparameters
    n_layers = trial.suggest_int('n_layers', 1, 3)
    n_units = trial.suggest_int('n_units', 50, 200)
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    epochs = trial.suggest_int('epochs', 10, 50)

    fold_mse = []
    splitter = TimeSeriesSplit(n_splits=4)

    for train_index, val_index in splitter.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_val, y_val = X[val_index], y[val_index]

        # Input layer followed by n_layers LSTM+Dropout pairs and a Dense head
        model = Sequential()
        model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))
        for depth in range(1, n_layers + 1):
            # Only the final LSTM layer collapses the sequence dimension
            model.add(LSTM(units=n_units, return_sequences=depth < n_layers))
            model.add(Dropout(dropout_rate))
        model.add(Dense(1))

        model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')

        # Early stopping plus Optuna pruning, both watching val_loss
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True),
            KerasPruningCallback(trial, monitor="val_loss"),
        ]
        model.fit(
            X_train,
            y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val, y_val),
            callbacks=callbacks,
            verbose=0,
        )

        # Score the fold after reversing the target scaling
        predictions = scaler_y.inverse_transform(model.predict(X_val))
        actuals = scaler_y.inverse_transform(y_val.reshape(-1, 1))
        fold_mse.append(mean_squared_error(actuals, predictions))

    # Mean MSE over the expanding-window folds
    return np.mean(fold_mse)


# Launch the search: a minimisation study, 50 trials, n_jobs=-1
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=50, n_jobs=-1)

# Report the best configuration and its score
print("Best hyperparameters:", study.best_params)
print("Best MSE:", study.best_value)

**CatBoost using Random Grid Search**

In [None]:
# Inputs for this search, built here so the cell does not depend on `tscv`,
# `X_all` and `y` left behind by other cells: in notebook order the preceding
# LSTM cells rebind `y` and `tscv` to their own sequence data. The construction
# mirrors the XGBoost random-search cell: standardized 'lag' columns of
# `train_data` as features, 'Close' as target, 4 expanding-window folds.
tscv = TimeSeriesSplit(n_splits=4)
feature_columns = [col for col in train_data.columns if 'lag' in col]
X_all = StandardScaler().fit_transform(train_data[feature_columns])
y = train_data['Close'].values

# Define the parameter grid for CatBoost
param_grid_catboost = {
    'iterations': [50, 100, 150],
    'depth': [5, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5],
    'border_count': [32, 64, 128],
    'subsample': [0.7, 0.8, 1.0],
}

# Sample 50 random combinations of hyperparameters
param_combinations_catboost = list(ParameterSampler(param_grid_catboost, n_iter=50, random_state=42))

# Store the best parameters and the lowest MSE
best_params_catboost = None
lowest_mse_catboost = float("inf")

# Hyperparameter tuning on the whole training set with all features for CatBoost
for params in param_combinations_catboost:
    mse_scores = []  # To track MSE scores across splits

    # Expanding window cross-validation
    for train_idx, val_idx in tscv.split(X_all):
        X_train, X_val = X_all[train_idx], X_all[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Initialize the model with current hyperparameters
        model = CatBoostRegressor(**params, random_state=42, silent=True, thread_count=-1)

        # Train and validate on the current split
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        mse_scores.append(mse)

    # Calculate the average MSE across all splits
    avg_mse = np.mean(mse_scores)

    # Update the best parameters if the current average MSE is lower
    if avg_mse < lowest_mse_catboost:
        lowest_mse_catboost = avg_mse
        best_params_catboost = params

print(f"Best hyperparameters found for CatBoost: {best_params_catboost}")
print(f"Lowest MSE for CatBoost on full feature set: {lowest_mse_catboost:.4f}")


**CatBoost using Optuna**

In [None]:
# Define the target and features: every column whose name contains 'lag'
# is a predictor, 'Close' is the target.
target_column = 'Close'
feature_columns = [col for col in train_data.columns if 'lag' in col]

X_train = train_data[feature_columns]
y_train = train_data[target_column]

# Expanding window time series validation
n_splits = 4
tscv = TimeSeriesSplit(n_splits=n_splits)

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: mean MSE of a CatBoostRegressor over the `tscv` folds.

    Reads the module-level ``X_train``, ``y_train`` and ``tscv`` defined in
    this cell.

    Args:
        trial: the ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        Mean of the per-fold mean squared errors.
    """
    # Define the hyperparameters to be tuned for CatBoost.
    # NOTE(review): `subsample` and `bagging_temperature` are sampled together;
    # confirm CatBoost accepts both under its default bootstrap type.
    param = {
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'random_strength': trial.suggest_int('random_strength', 1, 20),
        'random_seed': trial.suggest_int('random_seed', 1, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        # suggest_loguniform is deprecated; this is the equivalent call and
        # matches the `learning_rate` entry above.
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0, log=True),
        'iterations': trial.suggest_categorical('iterations', [100, 200, 500, 1000]),
        'depth': trial.suggest_int('depth', 3, 12),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0)
    }

    # List to store MSE for each fold
    mse_list = []

    # Perform time series cross-validation
    for train_index, test_index in tscv.split(X_train):
        X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

        # Fit model with the current parameter combination
        model = CatBoostRegressor(**param, verbose=0)
        model.fit(X_train_fold, y_train_fold)

        # Predict on the test fold
        y_pred_fold = model.predict(X_test_fold)

        # Calculate MSE for the fold
        mse_fold = mean_squared_error(y_test_fold, y_pred_fold)
        mse_list.append(mse_fold)

    # Return the average MSE across all folds
    return np.mean(mse_list)

# Run Optuna to find the best hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

# Get the best parameters and best MSE from the study
best_params = study.best_params
best_mse = study.best_value

# Output the best parameter combination and average MSE
print("Best parameter combination:", best_params)
print("Best average MSE:", best_mse)


**XGBoost using Random Grid Search**

In [None]:
# Expanding-window splitter with 4 folds
n_splits = 4
tscv = TimeSeriesSplit(n_splits=n_splits)

# Target is 'Close'; predictors are all columns whose name contains 'lag'
target_column = 'Close'
feature_columns = [name for name in train_data.columns if 'lag' in name]

X_train_data, y_train_data = train_data[feature_columns], train_data[target_column]

# Standardize the predictors and keep a DataFrame with the column names
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_data)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train_data.columns)

# Plain arrays used for positional fold indexing
X_all = X_train_scaled_df.values
y = train_data['Close'].values

# Search space for XGBoost
param_grid_xgb = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 1.5, 2],
}

# Draw 50 random combinations from the grid
param_combinations_xgb = list(ParameterSampler(param_grid_xgb, n_iter=50, random_state=42))

# Running best: parameters and their average MSE
best_params_xgb, lowest_mse_xgb = None, float("inf")

# Evaluate each combination with expanding-window cross-validation
for params in param_combinations_xgb:
    mse_scores = []  # one MSE per fold

    for train_idx, val_idx in tscv.split(X_all):
        X_train, y_train = X_all[train_idx], y[train_idx]
        X_val, y_val = X_all[val_idx], y[val_idx]

        # Fresh model per fold with the current hyperparameters
        model = XGBRegressor(**params, random_state=42, n_jobs=-1)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        mse_scores.append(mse)

    # Average over folds; keep the combination if it beats the running best
    avg_mse = np.mean(mse_scores)
    if avg_mse < lowest_mse_xgb:
        lowest_mse_xgb, best_params_xgb = avg_mse, params

print(f"Best hyperparameters found for XGBoost: {best_params_xgb}")
print(f"Lowest MSE for XGBoost on full feature set: {lowest_mse_xgb:.4f}")


**XGBoost using Optuna**

In [None]:
# Define the target and features: every column whose name contains 'lag'
# is a predictor, 'Close' is the target.
target_column = 'Close'
feature_columns = [col for col in train_data.columns if 'lag' in col]

X_train = train_data[feature_columns]
y_train = train_data[target_column]

# Expanding window time series validation
n_splits = 4
tscv = TimeSeriesSplit(n_splits=n_splits)

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: mean MSE of an XGBRegressor over the `tscv` folds.

    Reads the module-level ``X_train``, ``y_train`` and ``tscv`` defined in
    this cell.

    Args:
        trial: the ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        Mean of the per-fold mean squared errors.
    """
    # Define the hyperparameters to be tuned
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5, step=0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0, step=0.1),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 200, 500, 1000]),
        # suggest_loguniform is deprecated; suggest_float(..., log=True) is the
        # equivalent call and matches the `learning_rate` entry above.
        'alpha': trial.suggest_float('alpha', 0.01, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1.0, 5.0, log=True)
    }

    # List to store MSE for each fold
    mse_list = []

    # Perform time series cross-validation
    for train_index, test_index in tscv.split(X_train):
        X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

        # Fit model with the current parameter combination
        model = xgb.XGBRegressor(**param, n_jobs=-1)
        model.fit(X_train_fold, y_train_fold)

        # Predict on the test fold
        y_pred_fold = model.predict(X_test_fold)

        # Calculate MSE for the fold
        mse_fold = mean_squared_error(y_test_fold, y_pred_fold)
        mse_list.append(mse_fold)

    # Return the average MSE across all folds
    return np.mean(mse_list)

# Run Optuna to find the best hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

# Get the best parameters and best MSE from the study
best_params = study.best_params
best_mse = study.best_value

# Output the best parameter combination and average MSE
print("Best parameter combination:", best_params)
print("Best average MSE:", best_mse)


**Random Forest using Random Grid Search**

In [None]:
# Expanding-window splitter with 4 folds
n_splits = 4
tscv = TimeSeriesSplit(n_splits=n_splits)

# Target is 'Close'; predictors are all columns whose name contains 'lag'
target_column = 'Close'
feature_columns = [name for name in train_data.columns if 'lag' in name]

X_train_data, y_train_data = train_data[feature_columns], train_data[target_column]

# Standardize the predictors and keep a DataFrame with the column names
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_data)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train_data.columns)

# Plain arrays used for positional fold indexing
X_all = X_train_scaled_df.values
y = train_data['Close'].values

# Search space for Random Forest
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt'],
    'bootstrap': [True, False],
}

# Draw 50 random combinations from the grid
param_combinations = list(ParameterSampler(param_grid, n_iter=50, random_state=42))

# Running best: parameters and their average MSE
best_params, lowest_mse = None, float("inf")

# Evaluate each combination with expanding-window cross-validation
for params in param_combinations:
    mse_scores = []  # one MSE per fold

    for train_idx, val_idx in tscv.split(X_all):
        X_train, y_train = X_all[train_idx], y[train_idx]
        X_val, y_val = X_all[val_idx], y[val_idx]

        # Fresh model per fold with the current hyperparameters
        model = RandomForestRegressor(**params, random_state=42, n_jobs=-1)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        mse_scores.append(mse)

    # Average over folds; keep the combination if it beats the running best
    avg_mse = np.mean(mse_scores)
    if avg_mse < lowest_mse:
        lowest_mse, best_params = avg_mse, params

print(f"Best hyperparameters found on the full feature set: {best_params}")
print(f"Lowest MSE on full feature set: {lowest_mse:.4f}")



**Random Forest using Optuna**


In [None]:
# Target is 'Close'; predictors are all columns whose name contains 'lag'
target_column = 'Close'
feature_columns = [name for name in train_data.columns if 'lag' in name]

X_train, y_train = train_data[feature_columns], train_data[target_column]

# Expanding-window splitter with 4 folds
n_splits = 4
tscv = TimeSeriesSplit(n_splits=n_splits)


def objective(trial):
    """Optuna objective: mean MSE of a RandomForestRegressor over the `tscv` folds.

    Uses the module-level ``X_train``, ``y_train`` and ``tscv`` from this cell
    and returns the mean of the per-fold mean squared errors.
    """
    # Sampled Random Forest hyperparameters
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
        'max_features': trial.suggest_categorical('max_features', [None, 'sqrt', 'log2']),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }

    fold_errors = []
    for fit_rows, eval_rows in tscv.split(X_train):
        # Positional slices for this fold
        X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
        X_eval, y_eval = X_train.iloc[eval_rows], y_train.iloc[eval_rows]

        forest = RandomForestRegressor(**param, n_jobs=-1, random_state=42)
        forest.fit(X_fit, y_fit)

        fold_errors.append(mean_squared_error(y_eval, forest.predict(X_eval)))

    # Average MSE across folds
    return np.mean(fold_errors)


# Run the search: minimise the objective over 30 trials
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=30)

# Best configuration and its score
best_params, best_mse = study.best_params, study.best_value

print("Best parameter combination:", best_params)
print("Best average MSE:", best_mse)


**Decision Tree using Random Grid Search**

In [None]:
# Expanding-window splitter with 4 folds
n_splits = 4
tscv = TimeSeriesSplit(n_splits=n_splits)

# Search space for the Decision Tree
param_grid = dict(
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)

# Draw 50 random combinations from the grid
param_combinations = list(ParameterSampler(param_grid, n_iter=50, random_state=42))

# Function to perform cross-validation with TimeSeriesSplit
def tune_decision_tree(X, y, param_combinations, tscv):
    """Pick the Decision Tree parameters with the lowest cross-validated MSE.

    Args:
        X: feature array, indexed with the integer index arrays yielded by
            ``tscv.split``.
        y: target array, indexed the same way as ``X``.
        param_combinations: iterable of keyword dicts for
            ``DecisionTreeRegressor``.
        tscv: splitter exposing ``split(X)`` that yields
            ``(train_idx, val_idx)`` pairs.

    Returns:
        ``(best_params, lowest_mse)``: the parameter dict with the lowest mean
        MSE across the splits and that mean. ``best_params`` is ``None`` and
        ``lowest_mse`` is ``inf`` when no combination improves on ``inf``
        (for example when ``param_combinations`` is empty).
    """
    # The notebook's import cell does not import DecisionTreeRegressor, so
    # bring it into scope here to avoid a NameError on a fresh kernel.
    from sklearn.tree import DecisionTreeRegressor

    best_params = None
    lowest_mse = float("inf")

    # Iterate through all sampled parameter combinations
    for params in param_combinations:
        mse_scores = []

        # Expanding window cross-validation
        for train_idx, val_idx in tscv.split(X):
            X_cv_train, X_cv_val = X[train_idx], X[val_idx]
            y_cv_train, y_cv_val = y[train_idx], y[val_idx]

            # Initialize and fit Decision Tree with current parameters
            model = DecisionTreeRegressor(**params, random_state=42)
            model.fit(X_cv_train, y_cv_train)
            y_pred = model.predict(X_cv_val)

            # Calculate MSE for the current fold
            mse = mean_squared_error(y_cv_val, y_pred)
            mse_scores.append(mse)

        # Calculate average MSE across splits
        avg_mse = np.mean(mse_scores)

        # Update the best parameters if the current average MSE is lower
        if avg_mse < lowest_mse:
            lowest_mse = avg_mse
            best_params = params

    return best_params, lowest_mse

# Tune on the complete feature set: scaled features (labelled with X_train's
# columns, then taken back as an array) and the target values
X_all = pd.DataFrame(data=X_train_scaled, columns=X_train.columns).values
y = y_train.values

decision_tree_search = tune_decision_tree(X_all, y, param_combinations, tscv)
best_decision_tree_params, lowest_mse_all_features = decision_tree_search

print(f"Best Decision Tree hyperparameters for all features: {best_decision_tree_params}")
print(f"Lowest MSE on all features: {lowest_mse_all_features:.4f}")


# **Train and Test Models**

**Defining Features from Each Feature Selection Method**

In [41]:
# Hard-coded column-name lists, one per feature-selection method.
# Each list is used below to subset the scaled train/test frames by column name.
# NOTE(review): presumably copied from the outputs of the feature-selection
# steps earlier in the notebook — confirm they are still in sync.

# 20 columns labelled "MFNN"
mfnn_top_features = ['News - Negative Sentiment_lag4', 'News - Stocks_lag2',
       'News - New Products_lag1', 'News - Personnel Changes_lag2',
       'News - Negative Sentiment_lag3', 'News - Positive Sentiment_lag2',
       'News - Negative Sentiment_lag2', 'News - Adverse Events_lag2',
       'News - Positive Sentiment_lag1', 'News - New Products_lag3',
       'News - Store Openings_lag1', 'News - Positive Sentiment_lag3',
       'Volume_lag4', 'News - Positive Sentiment_lag4',
       'News - Corporate Earnings_lag1', 'Volume_lag3', 'Close_lag3',
       'Close_lag2', 'Close_lag4', 'Close_lag1']

# Baseline set: only the five lagged close prices
close_prices = ['Close_lag1',
                'Close_lag2',
                'Close_lag3',
                'Close_lag4',
                'Close_lag5']

# 20 columns labelled "Lasso"
lasso_top_features = ['News - Positive Sentiment_lag3',
 'Volume_lag3',
 'News - Stock Rumors_lag2',
 'News - Personnel Changes_lag2',
 'News - Adverse Events_lag2',
 'News - Negative Sentiment_lag2',
 'News - Negative Sentiment_lag3',
 'News - Product Recalls_lag2',
 'News - Mergers & Acquisitions_lag2',
 'News - Corporate Earnings_lag2',
 'News - Dividends_lag2',
 'News - Stocks_lag2',
 'News - Analyst Comments_lag2',
 'News - Layoffs_lag2',
 'News - Store Openings_lag2',
 'News - New Products_lag3',
 'Close_lag3',
 'Close_lag5',
 'Close_lag4',
 'Close_lag1']

# 20 columns labelled "SVR"
svr_top_features = ['News - Negative Sentiment_lag2',
 'News - Positive Sentiment_lag1',
 'News - Adverse Events_lag3',
 'Volume_lag3',
 'News - Product Recalls_lag4',
 'News - Negative Sentiment_lag5',
 'News - Corporate Earnings_lag1',
 'Volume_lag4',
 'News - Product Recalls_lag1',
 'News - Analyst Comments_lag2',
 'News - Product Recalls_lag5',
 'News - Analyst Comments_lag4',
 'News - Analyst Comments_lag3',
 'News - Stocks_lag5',
 'News - Analyst Comments_lag5',
 'Close_lag4',
 'Close_lag5',
 'Close_lag3',
 'Close_lag2',
 'Close_lag1']


# 20 columns labelled "RF"
rf_top_features = ['News - Mergers & Acquisitions_lag5',
 'News - Analyst Comments_lag4',
 'News - Stocks_lag5',
 'News - Analyst Comments_lag2',
 'News - Stocks_lag4',
 'News - Analyst Comments_lag3',
 'News - Stocks_lag2',
 'News - Analyst Comments_lag5',
 'News - Mergers & Acquisitions_lag1',
 'News - Corporate Earnings_lag1',
 'Volume_lag4',
 'Volume_lag3',
 'Volume_lag1',
 'Volume_lag5',
 'Volume_lag2',
 'Close_lag5',
 'Close_lag4',
 'Close_lag3',
 'Close_lag2',
 'Close_lag1']


# 11 columns labelled "MFFS" (shorter than the other method lists)
mffs_top_features = ['Volume_lag3',
 'News - Positive Sentiment_lag3',
 'News - Stock Rumors_lag2',
 'News - Dividends_lag2',
 'News - Layoffs_lag2',
 'Close_lag3',
 'News - Corporate Earnings_lag2',
 'News - Product Recalls_lag2',
 'News - Product Recalls_lag4',
 'News - Analyst Comments_lag4',
 'News - Stocks_lag5']

**Defining Dataset**

In [43]:
# Convert numpy arrays to DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train_data.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test_data.columns)

# Subset columns for training data based on selected features
X_whole_train_all = X_train_scaled_df.copy()
X_mfnn_train_all = X_train_scaled_df.loc[:, mfnn_top_features]
X_lasso_train_all = X_train_scaled_df.loc[:, lasso_top_features]
X_svr_train_all = X_train_scaled_df.loc[:, svr_top_features]
X_rf_train_all = X_train_scaled_df.loc[:, rf_top_features]
X_mffs_train_all = X_train_scaled_df.loc[:, mffs_top_features]
X_close_train_all = X_train_scaled_df.loc[:, close_prices]

# Same column subsets for the test data
X_whole_test_all = X_test_scaled_df.copy()
X_mfnn_test_all = X_test_scaled_df.loc[:, mfnn_top_features]
X_lasso_test_all = X_test_scaled_df.loc[:, lasso_top_features]
X_svr_test_all = X_test_scaled_df.loc[:, svr_top_features]
X_rf_test_all = X_test_scaled_df.loc[:, rf_top_features]
X_mffs_test_all = X_test_scaled_df.loc[:, mffs_top_features]
X_close_test_all = X_test_scaled_df.loc[:, close_prices]

# Create y
y_train_all = y_train_data.copy()

# Boolean mask of the AAPL rows in the test set, computed once and reused.
# (The former `selected_indices_aapl` assignment from `train_data` was removed:
# it was overwritten below before ever being read.)
# NOTE(review): the mask carries test_data's index while the scaled frames use
# a fresh default index — confirm the two line up.
aapl_test_mask = test_data['Symbol'] == 'AAPL'

X_whole_test_aapl = X_whole_test_all[aapl_test_mask]
X_mfnn_test_aapl = X_mfnn_test_all[aapl_test_mask]
X_lasso_test_aapl = X_lasso_test_all[aapl_test_mask]
X_svr_test_aapl = X_svr_test_all[aapl_test_mask]
X_rf_test_aapl = X_rf_test_all[aapl_test_mask]
X_mffs_test_aapl = X_mffs_test_all[aapl_test_mask]
X_close_test_aapl = X_close_test_all[aapl_test_mask]

# Get indices of test_data that match the selected symbols
selected_indices_aapl = test_data[aapl_test_mask].index

# Filter y_test based on the selected indices
y_test_aapl = y_test_data.loc[selected_indices_aapl]

# Define datasets dictionary: name -> (all-company training features, AAPL-only test features)
datasets_all = {
    'Whole_all': (X_whole_train_all, X_whole_test_aapl),
    'MFNN_Top_all': (X_mfnn_train_all, X_mfnn_test_aapl),
    'Lasso_Top_all': (X_lasso_train_all, X_lasso_test_aapl),
    'SVR_Top_all': (X_svr_train_all, X_svr_test_aapl),
    'RF_Top_all': (X_rf_train_all, X_rf_test_aapl),
    'MFFS_Top_all': (X_mffs_train_all, X_mffs_test_aapl),
    'Close_Prices_all': (X_close_train_all, X_close_test_aapl)
}

# Creating train_data for some companies

# Get unique company symbols
symbols = train_data['Symbol'].unique()

# Select half of the remaining symbols
half_symbols = symbols[:len(symbols) // 2]

X_whole_train_some = X_whole_train_all[~train_data['Symbol'].isin(half_symbols)]
X_mfnn_train_some = X_mfnn_train_all[~train_data['Symbol'].isin(half_symbols)]
X_lasso_train_some = X_lasso_train_all[~train_data['Symbol'].isin(half_symbols)]
X_svr_train_some = X_svr_train_all[~train_data['Symbol'].isin(half_symbols)]
X_rf_train_some = X_rf_train_all[~train_data['Symbol'].isin(half_symbols)]
X_mffs_train_some = X_mffs_train_all[~train_data['Symbol'].isin(half_symbols)]
X_close_train_some = X_close_train_all[~train_data['Symbol'].isin(half_symbols)]

# Get indices of train_data that match the selected symbols
selected_indices = train_data[~train_data['Symbol'].isin(half_symbols)].index

# Filter y_train based on the selected indices
y_train_some = y_train_data.loc[selected_indices]

datasets_some = {
    'Whole_some': (X_whole_train_some, X_whole_test_aapl),
    'MFNN_Top_some': (X_mfnn_train_some, X_mfnn_test_aapl),
    'Lasso_Top_some': (X_lasso_train_some, X_lasso_test_aapl),
    'SVR_Top_some': (X_svr_train_some, X_svr_test_aapl),
    'RF_Top_some': (X_rf_train_some, X_rf_test_aapl),
    'MFFS_Top_some': (X_mffs_train_some, X_mffs_test_aapl),
    'Close_Prices_some': (X_close_train_some, X_close_test_aapl)
}

# AAPL
X_whole_train_aapl = X_whole_train_all[train_data['Symbol'] == 'AAPL']
X_mfnn_train_aapl = X_mfnn_train_all[train_data['Symbol'] == 'AAPL']
X_lasso_train_aapl = X_lasso_train_all[train_data['Symbol'] == 'AAPL']
X_svr_train_aapl = X_svr_train_all[train_data['Symbol'] == 'AAPL']
X_rf_train_aapl = X_rf_train_all[train_data['Symbol'] == 'AAPL']
X_mffs_train_aapl = X_mffs_train_all[train_data['Symbol'] == 'AAPL']
X_close_train_aapl = X_close_train_all[train_data['Symbol'] == 'AAPL']

# Get indices of train_data that match the selected symbols
selected_indices_aapl = train_data[train_data['Symbol'] == 'AAPL'].index

# Filter y_train based on the selected indices
y_train_aapl = y_train_data.loc[selected_indices_aapl]

datasets_aapl = {
    'Whole_aapl': (X_whole_train_aapl, X_whole_test_aapl),
    'MFNN_Top_aapl': (X_mfnn_train_aapl, X_mfnn_test_aapl),
    'Lasso_Top_aapl': (X_lasso_train_aapl, X_lasso_test_aapl),
    'SVR_Top_aapl': (X_svr_train_aapl, X_svr_test_aapl),
    'RF_Top_aapl': (X_rf_train_aapl, X_rf_test_aapl),
    'MFFS_Top_aapl': (X_mffs_train_aapl, X_mffs_test_aapl),
    'Close_Prices_aapl': (X_close_train_aapl, X_close_test_aapl)
}

# Define filters for each company in the train and test datasets
aapl_train_filter = train_data['Symbol'] == 'AAPL'
aapl_test_filter = test_data['Symbol'] == 'AAPL'

# Select only the closing price columns for each company in train and test datasets
unscaled_X_train_close_aapl = X_train_data.loc[aapl_train_filter, close_prices]
unscaled_X_test_close_aapl = X_test_data.loc[aapl_test_filter, close_prices]

# Convert 'Date' column to datetime format
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Prepare data for ARIMA by setting date to index and filtering only for each company

# For AAPL
train_data_aapl = train_data[train_data['Symbol'] == 'AAPL'].copy()
train_data_aapl.set_index('Date', inplace=True)

test_data_aapl = test_data[test_data['Symbol'] == 'AAPL'].copy()
test_data_aapl.set_index('Date', inplace=True)

# AAPL Close price data for ARIMA
close_only_train_aapl = train_data_aapl['Close']
close_only_test_aapl = test_data_aapl['Close']

# Define all datasets and their respective y_train and y_test
datasets_dict = {
    'All_Company': (datasets_all, y_train_all, y_test_aapl),
    'Some_Company': (datasets_some, y_train_some, y_test_aapl),
    'AAPL': (datasets_aapl, y_train_aapl, y_test_aapl),
}


  train_data = pd.read_csv("train_data.csv")


In [None]:
# Shared evaluation state for the model cells that follow

# Accumulator for metrics: each model cell appends one dict per evaluated run
results = []

# Time-ordered cross-validation splitter (4 splits), reused when tuning below
n_splits = 4
tscv = TimeSeriesSplit(n_splits=n_splits)

**Linear Regression**

In [None]:
# One shared estimator instance; the helper below refits it in place on every call
linear_model = LinearRegression()


def evaluate_and_save_results_linear(dataset_name, X_train, X_test, y_train, y_test, method_label, results_folder):
    """Fit the shared linear model on one dataset and persist everything about the run.

    Side effects:
      * refits the module-level ``linear_model``;
      * appends one metrics dict (R2 / RMSE / MAE for train and test) to the
        module-level ``results`` list;
      * pickles the fitted model into ``results_folder``;
      * writes two "actual vs predicted" PNG plots (train and test) there.

    Parameters:
        dataset_name: label of the feature set; used in the results row and file names.
        X_train, X_test: feature matrices for fitting and evaluation.
        y_train, y_test: targets; their ``.index`` is used as the x-axis of the plots.
        method_label: label of the training scope; stored as 'Method' and used
            in the pickle file name.
        results_folder: output directory, created if it does not exist.
    """
    os.makedirs(results_folder, exist_ok=True)

    # Fit, then predict on both splits
    linear_model.fit(X_train, y_train)
    train_pred = linear_model.predict(X_train)
    test_pred = linear_model.predict(X_test)

    # Index-aligned prediction frames with dataset-specific column names (used for plotting)
    train_pred_df = pd.DataFrame(train_pred, index=y_train.index, columns=[f'y_pred_{dataset_name}_train'])
    test_pred_df = pd.DataFrame(test_pred, index=y_test.index, columns=[f'y_pred_{dataset_name}_test'])

    # Build the results row: identification first, then train metrics, then test metrics
    row = {
        'Dataset': dataset_name,
        'Method': method_label,
        'Model': 'Linear Regression',
    }
    for split, truth, pred in (('Train', y_train, train_pred), ('Test', y_test, test_pred)):
        row[f'R2_{split}'] = r2_score(truth, pred)
        row[f'RMSE_{split}'] = sqrt(mean_squared_error(truth, pred))
        row[f'MAE_{split}'] = mean_absolute_error(truth, pred)
    results.append(row)

    # Persist the fitted estimator; the file name encodes training scope and dataset
    pickle_path = os.path.join(results_folder, f'linear_model_{method_label}_{dataset_name}.pkl')
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(linear_model, pickle_file)

    # One actual-vs-predicted figure per split
    plot_specs = (
        ('train', 'Training', y_train, train_pred_df),
        ('test', 'Testing', y_test, test_pred_df),
    )
    for data_type, title_suffix, y_true, pred_df in plot_specs:
        plt.figure(figsize=(10, 6))
        plt.plot(y_true.index, y_true, label=f'Actual {title_suffix}', color='blue')
        plt.plot(y_true.index, pred_df.iloc[:, 0], label=f'Predicted {title_suffix}', color='red')
        plt.title(f'Linear Regression - Actual vs Predicted Closing Price ({title_suffix}) - {dataset_name}')
        plt.xlabel('Data point')
        plt.ylabel('Closing Price')
        plt.legend()
        png_path = os.path.join(results_folder, f'actual_vs_predicted_{data_type}_Linear_Regression_{dataset_name}.png')
        plt.savefig(png_path)
        plt.close()  # release the figure so repeated calls do not accumulate open figures

# Define the folder to save all results
# (note: `results_folder` is a module-level name that later model cells rebind)
results_folder = 'linear_model_results'

# Evaluate the linear model on every (training scope, feature set) pair.
# Outer loop: training scope label -> (feature datasets, y_train, y_test).
# Inner loop: feature-set label -> (X_train, X_test).
# Each call appends one row to `results` and writes a pickle plus two plots
# into `results_folder`.
for method_label, (datasets, y_train, y_test) in datasets_dict.items():
    for dataset_name, (X_train, X_test) in datasets.items():
        evaluate_and_save_results_linear(dataset_name, X_train, X_test, y_train, y_test, method_label, results_folder)

**Lasso Regression**

In [None]:
# Define the folder to save all results
results_folder = 'lasso_results'
os.makedirs(results_folder, exist_ok=True)  # Create the folder if it does not exist

# Candidate regularisation strengths: 50 log-spaced values from 1e-3 to 1e1
alphas = np.logspace(-3, 1, 50)  # Adjust range as needed

# Iteration cap for coordinate descent. Shared by the tuning run and the final
# estimator so the model that is evaluated is solved under the same settings
# that were used to pick alpha (Lasso's own default cap is lower).
LASSO_MAX_ITER = 10000

# Tune alpha with time-ordered cross-validation (tscv) on the full scaled
# training frame, i.e. all features and all companies
lasso_cv = LassoCV(alphas=alphas, cv=tscv, max_iter=LASSO_MAX_ITER)
lasso_cv.fit(X_train_scaled_df, y_train_data)

# Get the best alpha value
best_alpha = lasso_cv.alpha_
print(f"Optimal alpha: {best_alpha}")

# Shared estimator refitted by evaluate_and_save_results_lasso on each dataset;
# the single tuned alpha is reused for every feature subset and training scope
lasso_model = Lasso(alpha=best_alpha, max_iter=LASSO_MAX_ITER)

# Function to evaluate model, save results, and append to `results`
def evaluate_and_save_results_lasso(dataset_name, X_train, X_test, y_train, y_test, method_label,
                                    results_folder='lasso_results'):
    """Fit the shared Lasso model on one dataset and persist everything about the run.

    Side effects:
      * refits the module-level ``lasso_model``;
      * appends one metrics dict (R2 / RMSE / MAE for train and test) to the
        module-level ``results`` list;
      * pickles the fitted model into ``results_folder``;
      * writes two "actual vs predicted" PNG plots (train and test) there.

    Parameters:
        dataset_name: label of the feature set; used in the results row and file names.
        X_train, X_test: feature matrices for fitting and evaluation.
        y_train, y_test: targets; their ``.index`` is used as the x-axis of the plots.
        method_label: label of the training scope; stored as 'Method' and used
            in the pickle file name.
        results_folder: output directory (created if missing). It is an explicit
            parameter rather than a lookup of the module-level ``results_folder``
            because later cells rebind that global to other models' folders;
            reading it at call time would send Lasso artefacts to the wrong
            directory if this function were called after those cells ran.
    """
    os.makedirs(results_folder, exist_ok=True)

    # Fit the model and make predictions
    lasso_model.fit(X_train, y_train)
    y_pred_train_values = lasso_model.predict(X_train)
    y_pred_test_values = lasso_model.predict(X_test)

    # Index-aligned prediction frames with dataset-specific column names (used for plotting)
    y_pred_train_df = pd.DataFrame(y_pred_train_values, index=y_train.index, columns=[f'y_pred_{dataset_name}_train'])
    y_pred_test_df = pd.DataFrame(y_pred_test_values, index=y_test.index, columns=[f'y_pred_{dataset_name}_test'])

    # Calculate metrics
    r2_train = r2_score(y_train, y_pred_train_values)
    rmse_train = sqrt(mean_squared_error(y_train, y_pred_train_values))
    mae_train = mean_absolute_error(y_train, y_pred_train_values)

    r2_test = r2_score(y_test, y_pred_test_values)
    rmse_test = sqrt(mean_squared_error(y_test, y_pred_test_values))
    mae_test = mean_absolute_error(y_test, y_pred_test_values)

    # Append results to the already defined `results` list
    results.append({
        'Dataset': dataset_name,
        'Method': method_label,
        'Model': 'Lasso Regression',
        'R2_Train': r2_train,
        'RMSE_Train': rmse_train,
        'MAE_Train': mae_train,
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Save the model using pickle with a filename that includes the dataset and method
    model_filename = os.path.join(results_folder, f'lasso_model_{method_label}_{dataset_name}.pkl')
    with open(model_filename, 'wb') as model_file:
        pickle.dump(lasso_model, model_file)

    # Plot actual vs predicted for both training and testing
    for data_type, y_true, y_pred_df, title_suffix in zip(
        ['train', 'test'],
        [y_train, y_test],
        [y_pred_train_df, y_pred_test_df],
        ['Training', 'Testing']
    ):
        plt.figure(figsize=(10, 6))
        plt.plot(y_true.index, y_true, label=f'Actual {title_suffix}', color='blue')
        plt.plot(y_true.index, y_pred_df.iloc[:, 0], label=f'Predicted {title_suffix}', color='red')
        plt.title(f'Lasso Regression - Actual vs Predicted Closing Price ({title_suffix}) - {dataset_name}')
        plt.xlabel('Data point')
        plt.ylabel('Closing Price')
        plt.legend()
        plot_filename = os.path.join(results_folder, f'actual_vs_predicted_{data_type}_Lasso_Regression_{dataset_name}.png')
        plt.savefig(plot_filename)
        plt.close()  # release the figure so repeated calls do not accumulate open figures

# Evaluate the Lasso model on every (training scope, feature set) pair.
# Outer loop: training scope label -> (feature datasets, y_train, y_test).
# Inner loop: feature-set label -> (X_train, X_test).
# Each call appends one row to `results` and writes a pickle plus two plots.
for method_label, (datasets, y_train, y_test) in datasets_dict.items():
    for dataset_name, (X_train, X_test) in datasets.items():
        evaluate_and_save_results_lasso(dataset_name, X_train, X_test, y_train, y_test, method_label)

**Random Walk**

In [None]:
# Set a random seed for reproducibility
np.random.seed(42)

# Define the folder to save all results
results_folder = 'random_walk_with_noise_results'
os.makedirs(results_folder, exist_ok=True)  # Create the folder if it does not exist

# Random-walk baseline: predict each value as the previous actual value plus
# a small Gaussian perturbation.
for company, (datasets, y_train, y_test) in datasets_dict.items():
    # Only the single-company scope is evaluated; the other scopes are skipped
    if company != 'AAPL':
        continue

    # Noise scale: 1% of the training target's standard deviation, computed once
    noise_scale = y_train.std() * 0.01

    # Training set: previous value + noise. The first row has no predecessor,
    # so it is back-filled with the first available shifted value.
    noise_train = np.random.normal(0, noise_scale, len(y_train))
    y_pred_rw_train = y_train.shift(1).bfill() + noise_train

    # Testing set: previous day's *actual* value + noise. The first test row
    # uses the last training value as its predecessor. The noise is drawn after
    # the training noise, so the seeded stream is consumed in the same order.
    noise_test = np.random.normal(0, noise_scale, len(y_test))
    y_pred_rw_test_series = y_test.shift(1, fill_value=y_train.iloc[-1]) + noise_test

    # Evaluate metrics for both training and testing data.
    # root_mean_squared_error replaces mean_squared_error(..., squared=False),
    # whose `squared` argument is no longer accepted by current scikit-learn.
    r2_train = r2_score(y_train, y_pred_rw_train)
    rmse_train = root_mean_squared_error(y_train, y_pred_rw_train)
    mae_train = mean_absolute_error(y_train, y_pred_rw_train)

    r2_test = r2_score(y_test, y_pred_rw_test_series)
    rmse_test = root_mean_squared_error(y_test, y_pred_rw_test_series)
    mae_test = mean_absolute_error(y_test, y_pred_rw_test_series)

    # Append results to the list
    results.append({
        'Dataset': 'Close Price',
        'Method': f'{company} only',
        'Model': 'Random Walk with Noise',
        'R2_Train': r2_train,
        'RMSE_Train': rmse_train,
        'MAE_Train': mae_train,
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Plot actual vs predicted closing prices, one figure per split
    for data_type, label_suffix, title_suffix, y_true, y_pred in (
        ('train', 'Train', 'Training', y_train, y_pred_rw_train),
        ('test', 'Test', 'Testing', y_test, y_pred_rw_test_series),
    ):
        plt.figure(figsize=(10, 6))
        plt.plot(y_true.index, y_true, label=f'Actual {label_suffix}', color='blue')
        plt.plot(y_true.index, y_pred, label=f'Predicted {label_suffix}', color='red')
        plt.title(f'Random Walk with Noise - Actual vs Predicted Closing Price ({title_suffix}) - {company}')
        plt.xlabel('Date')
        plt.ylabel('Closing Price')
        plt.legend()
        plt.savefig(os.path.join(results_folder, f'actual_vs_predicted_{data_type}_{company}_RW_Noise_True_Close_Price.png'))
        plt.close()

**Simple Moving Average**

In [None]:
# Define the folder to save all results
results_folder = 'sma_results'
os.makedirs(results_folder, exist_ok=True)  # Create the folder if it does not exist

# List of companies to process
companies = ['AAPL']

# Company -> (unscaled close-price columns for train, same for test, y_train, y_test)
unscaled_data = {
    'AAPL': (unscaled_X_train_close_aapl, unscaled_X_test_close_aapl, y_train_aapl, y_test_aapl)
}

# Simple-moving-average baseline: the prediction for a row is the mean of its
# unscaled close-price columns.
for company, (unscaled_X_train_close, unscaled_X_test_close, y_train, y_test) in unscaled_data.items():

    # Row-wise mean across the close-price columns
    y_pred_sma_train = unscaled_X_train_close.mean(axis=1)
    y_pred_sma_test = unscaled_X_test_close.mean(axis=1)

    # Save predictions in DataFrames with appropriate names
    y_pred_train_df = pd.DataFrame(y_pred_sma_train, index=y_train.index, columns=[f'y_pred_{company}_train'])
    y_pred_test_df = pd.DataFrame(y_pred_sma_test, index=y_test.index, columns=[f'y_pred_{company}_test'])

    # Evaluate metrics for both training and testing data.
    # root_mean_squared_error replaces mean_squared_error(..., squared=False),
    # whose `squared` argument is no longer accepted by current scikit-learn.
    r2_train = r2_score(y_train, y_pred_sma_train)
    rmse_train = root_mean_squared_error(y_train, y_pred_sma_train)
    mae_train = mean_absolute_error(y_train, y_pred_sma_train)

    r2_test = r2_score(y_test, y_pred_sma_test)
    rmse_test = root_mean_squared_error(y_test, y_pred_sma_test)
    mae_test = mean_absolute_error(y_test, y_pred_sma_test)

    # Append results to the list
    results.append({
        'Dataset': 'Close Price',
        'Method': f'{company} only',
        'Model': 'SMA',
        'R2_Train': r2_train,
        'RMSE_Train': rmse_train,
        'MAE_Train': mae_train,
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Plot actual vs predicted closing prices, one figure per split
    for data_type, label_suffix, title_suffix, y_true, y_pred in (
        ('train', 'Train', 'Training', y_train, y_pred_sma_train),
        ('test', 'Test', 'Testing', y_test, y_pred_sma_test),
    ):
        plt.figure(figsize=(10, 6))
        plt.plot(y_true.index, y_true, label=f'Actual {label_suffix}', color='blue')
        plt.plot(y_true.index, y_pred, label=f'Predicted {label_suffix}', color='red')
        plt.title(f'SMA - Actual vs Predicted Closing Price ({title_suffix}) - {company} Close Price')
        plt.xlabel('Date')
        plt.ylabel('Closing Price')
        plt.legend()
        plt.savefig(os.path.join(results_folder, f'actual_vs_predicted_{data_type}_{company}_SMA_Close_Price.png'))
        plt.close()

**ARIMA**

In [None]:
# Define the folder to save all results
results_folder = 'arima_results'
os.makedirs(results_folder, exist_ok=True)  # Create the folder if it does not exist

# Firm -> (date-indexed training close prices, date-indexed test close prices)
firms_data_arima = {
    'AAPL': (close_only_train_aapl, close_only_test_aapl)
}

# Loop over each firm
for firm, (close_only_train, close_only_test) in firms_data_arima.items():
    # Step 1: Find the best non-seasonal ARIMA order using auto_arima
    model = auto_arima(
        close_only_train,
        start_p=0, max_p=5,
        start_q=0, max_q=5,
        d=None,
        seasonal=False,
        trace=True,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True
    )
    p, d, q = model.order

    # Step 2: Fit a SARIMAX model with that order on the training data.
    # The parameters estimated here are reused, unchanged, for every test step.
    arimamodel = SARIMAX(close_only_train, order=(p, d, q))
    arimamodel_fit = arimamodel.fit(disp=False)

    # Step 3: Build the full observed history once. Each step below takes a
    # prefix of it instead of growing a Series with pd.concat inside the loop.
    full_history = pd.concat([close_only_train, close_only_test])
    n_train = len(close_only_train)
    observed_values = full_history.iloc[:n_train]

    # Step 4: One-step-ahead rolling forecast without re-estimating parameters
    rolling_forecast = []
    for i in range(len(close_only_test)):
        # Forecast the next point from everything observed so far
        forecast = arimamodel_fit.get_forecast(steps=1).predicted_mean.iloc[0]
        rolling_forecast.append(forecast)

        # Reveal the actual value for this step, then re-run the filter over the
        # extended history with the already-fitted parameters (no refit)
        observed_values = full_history.iloc[:n_train + i + 1]
        arimamodel = SARIMAX(observed_values, order=(p, d, q))
        arimamodel_fit = arimamodel.filter(arimamodel_fit.params)

    # Convert rolling_forecast to a pandas Series for easy comparison
    rolling_forecast_series = pd.Series(rolling_forecast, index=close_only_test.index)

    # Step 5: Calculate R2, RMSE, and MAE for the test set.
    # root_mean_squared_error replaces mean_squared_error(..., squared=False),
    # whose `squared` argument is no longer accepted by current scikit-learn.
    r2_test = r2_score(close_only_test, rolling_forecast_series)
    rmse_test = root_mean_squared_error(close_only_test, rolling_forecast_series)
    mae_test = mean_absolute_error(close_only_test, rolling_forecast_series)

    # Append results to the list
    results.append({
        'Dataset': f'{firm} Close Price',
        'Method': f'{firm} Only',
        'Model': 'ARIMA',
        'R2_Train': '-',  # No training R2 provided
        'RMSE_Train': '-',  # No training RMSE provided
        'MAE_Train': '-',  # No training MAE provided
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Save the model with pickle.
    # Note: this is the results object from the LAST loop step, i.e. filtered
    # over train + test observations with the training-time parameters.
    model_filename = os.path.join(results_folder, f'arima_model_{firm}_only.pkl')
    with open(model_filename, 'wb') as model_file:
        pickle.dump(arimamodel_fit, model_file)

    # Plot actual vs rolling forecasted values
    plt.figure(figsize=(10, 6))
    plt.plot(close_only_train.index, close_only_train, label='Training Data')
    plt.plot(close_only_test.index, close_only_test, label='Actual Test Data')
    plt.plot(close_only_test.index, rolling_forecast_series, label='Rolling Forecasted Data', linestyle='--')
    plt.xlabel('Data Point')
    plt.ylabel('Close Price')
    plt.title(f'{firm} Actual vs Rolling Forecast ARIMA Predictions')
    plt.legend()
    plot_filename = os.path.join(results_folder, f'actual_vs_rolling_forecast_{firm}_ARIMA.png')
    plt.savefig(plot_filename)
    plt.close()

**XGBoost**

In [None]:
# Define the folder to save all results
# (note: `results_folder` is a module-level name shared with the other model cells)
results_folder = 'xgboost_results'
os.makedirs(results_folder, exist_ok=True)  # Create the folder if it does not exist

# Dictionary to store XGBoost model configurations based on hyperparameter tuning methods.
# Outer key: configuration name; it is stored as the 'Model' label in `results`
# and embedded in the pickle/plot file names. Inner dict: keyword arguments
# unpacked into XGBRegressor(**params).
# NOTE(review): the values are presumably copied from earlier tuning runs that
# are not part of this cell — confirm their provenance.
# NOTE(review): the two configurations spell the L1/L2 penalties differently
# (`reg_alpha`/`reg_lambda` vs `alpha`/`lambda`); confirm the installed XGBoost
# version accepts both spellings through the sklearn wrapper.
xgboost_configs = {
    'XGBoost_RandomizedSearch': {
        'subsample': 1.0,
        'reg_lambda': 1,
        'reg_alpha': 0.1,
        'n_estimators': 150,
        'max_depth': 5,
        'learning_rate': 0.1,
        'gamma': 0,
        'colsample_bytree': 0.8
    },
    'XGBoost_Optuna': {
        'max_depth': 3,
        'min_child_weight': 2,
        'gamma': 0.4,
        'subsample': 1.0,
        'colsample_bytree': 0.4,
        'learning_rate': 0.17217120489054266,
        'n_estimators': 200,
        'alpha': 0.015033446295092711,
        'lambda': 2.4439050147224077
    }
}

# Function to evaluate model, save results, and append to `results`
def evaluate_and_save_results_xgboost(dataset_name, X_train, X_test, y_train, y_test, method_label, config_name, xgb_params,
                                      results_folder='xgboost_results'):
    """Train one XGBoost configuration on one dataset and persist everything about the run.

    A fresh ``XGBRegressor(**xgb_params)`` is created on every call.

    Side effects:
      * appends one metrics dict (R2 / RMSE / MAE for train and test) to the
        module-level ``results`` list;
      * pickles the fitted model into ``results_folder``;
      * writes two "actual vs predicted" PNG plots (train and test) there.

    Parameters:
        dataset_name: label of the feature set; used in the results row and file names.
        X_train, X_test: feature matrices for fitting and evaluation.
        y_train, y_test: targets; their ``.index`` is used as the x-axis of the plots.
        method_label: label of the training scope; stored as 'Method' and used
            in the pickle file name.
        config_name: name of the hyperparameter configuration; stored as 'Model'
            and used in file names and plot titles.
        xgb_params: keyword arguments for ``XGBRegressor``.
        results_folder: output directory (created if missing). It is an explicit
            parameter rather than a lookup of the module-level ``results_folder``
            because other cells rebind that global to their own folders; reading
            it at call time would send XGBoost artefacts to the wrong directory
            if this function were called after those cells ran.
    """
    os.makedirs(results_folder, exist_ok=True)

    # Initialize the XGBoost model with specified parameters
    xgb_model = XGBRegressor(**xgb_params)

    # Fit the model and make predictions
    xgb_model.fit(X_train, y_train)
    y_pred_train_values = xgb_model.predict(X_train)
    y_pred_test_values = xgb_model.predict(X_test)

    # Calculate metrics
    r2_train = r2_score(y_train, y_pred_train_values)
    rmse_train = sqrt(mean_squared_error(y_train, y_pred_train_values))
    mae_train = mean_absolute_error(y_train, y_pred_train_values)

    r2_test = r2_score(y_test, y_pred_test_values)
    rmse_test = sqrt(mean_squared_error(y_test, y_pred_test_values))
    mae_test = mean_absolute_error(y_test, y_pred_test_values)

    # Append results to the already defined `results` list
    results.append({
        'Dataset': dataset_name,
        'Method': method_label,
        'Model': config_name,
        'R2_Train': r2_train,
        'RMSE_Train': rmse_train,
        'MAE_Train': mae_train,
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Save the model using pickle with a filename that includes the dataset, method, and configuration name
    model_filename = os.path.join(results_folder, f'xgboost_model_{config_name}_{method_label}_{dataset_name}.pkl')
    with open(model_filename, 'wb') as model_file:
        pickle.dump(xgb_model, model_file)

    # Plot actual vs predicted for both training and testing
    for data_type, y_true, y_pred_values, title_suffix in zip(
        ['train', 'test'],
        [y_train, y_test],
        [y_pred_train_values, y_pred_test_values],
        ['Training', 'Testing']
    ):
        plt.figure(figsize=(10, 6))
        plt.plot(y_true.index, y_true, label=f'Actual {title_suffix}', color='blue')
        plt.plot(y_true.index, y_pred_values, label=f'Predicted {title_suffix}', color='red')
        plt.title(f'{config_name} - Actual vs Predicted Closing Price ({title_suffix}) - {dataset_name}')
        plt.xlabel('Data point')
        plt.ylabel('Closing Price')
        plt.legend()
        plot_filename = os.path.join(results_folder, f'actual_vs_predicted_{data_type}_{config_name}_{dataset_name}.png')
        plt.savefig(plot_filename)
        plt.close()  # release the figure so repeated calls do not accumulate open figures

# Evaluate every XGBoost configuration on every (training scope, feature set) pair.
# Outer loop: training scope label -> (feature datasets, y_train, y_test).
# Middle loop: feature-set label -> (X_train, X_test).
# Inner loop: configuration name -> XGBRegressor keyword arguments.
# Each call appends one row to `results` and writes a pickle plus two plots.
for method_label, (datasets, y_train, y_test) in datasets_dict.items():
    for dataset_name, (X_train, X_test) in datasets.items():
        for config_name, xgb_params in xgboost_configs.items():
            evaluate_and_save_results_xgboost(dataset_name, X_train, X_test, y_train, y_test, method_label, config_name, xgb_params)


**CatBoost**

In [None]:
# Define the folder to save all results
# (note: `results_folder` is a module-level name shared with the other model cells)
results_folder = 'catboost_results'
os.makedirs(results_folder, exist_ok=True)  # Create the folder if it does not exist

# Dictionary to store CatBoost model configurations based on hyperparameter tuning methods.
# Outer key: configuration name; it is stored as the 'Model' label in `results`
# and embedded in the pickle/plot file names. Inner dict: keyword arguments
# unpacked into CatBoostRegressor(**params, verbose=0).
# NOTE(review): the values are presumably copied from earlier tuning runs that
# are not part of this cell — confirm their provenance.
# NOTE(review): 'CatBoost_Optuna' sets both `subsample` and `bagging_temperature`
# without an explicit `bootstrap_type`; confirm the installed CatBoost version
# accepts this combination.
# Only 'CatBoost_Optuna' fixes `random_seed`; 'CatBoost_RandomizedSearch' does not set one.
catboost_configs = {
    'CatBoost_RandomizedSearch': {
        'subsample': 1.0,
        'learning_rate': 0.1,
        'l2_leaf_reg': 5,
        'iterations': 50,
        'depth': 5,
        'border_count': 128
    },
    'CatBoost_Optuna': {
        'subsample': 0.7199681031900936,
        'random_strength': 8,
        'random_seed': 55,
        'learning_rate': 0.02855751764303586,
        'l2_leaf_reg': 1.0878665873159017,
        'iterations': 1000,
        'depth': 3,
        'colsample_bylevel': 0.9462619361669599,
        'border_count': 203,
        'bagging_temperature': 0.8379990713830481
    }
}

# Function to evaluate model, save results, and append to `results`
def evaluate_and_save_results_catboost(dataset_name, X_train, X_test, y_train, y_test, method_label, config_name, cb_params,
                                       results_folder='catboost_results'):
    """Train one CatBoost configuration on one dataset and persist everything about the run.

    A fresh ``CatBoostRegressor(**cb_params, verbose=0)`` is created on every call.

    Side effects:
      * appends one metrics dict (R2 / RMSE / MAE for train and test) to the
        module-level ``results`` list;
      * pickles the fitted model into ``results_folder``;
      * writes two "actual vs predicted" PNG plots (train and test) there.

    Parameters:
        dataset_name: label of the feature set; used in the results row and file names.
        X_train, X_test: feature matrices for fitting and evaluation.
        y_train, y_test: targets; their ``.index`` is used as the x-axis of the plots.
        method_label: label of the training scope; stored as 'Method' and used
            in the pickle file name.
        config_name: name of the hyperparameter configuration; stored as 'Model'
            and used in file names and plot titles.
        cb_params: keyword arguments for ``CatBoostRegressor`` (must not
            contain ``verbose``, which is passed explicitly).
        results_folder: output directory (created if missing). It is an explicit
            parameter rather than a lookup of the module-level ``results_folder``
            because other cells rebind that global to their own folders; reading
            it at call time would send CatBoost artefacts to the wrong directory
            if this function were called after those cells ran.
    """
    os.makedirs(results_folder, exist_ok=True)

    # Initialize the CatBoost model with specified parameters
    cb_model = CatBoostRegressor(**cb_params, verbose=0)  # Suppress output with verbose=0

    # Fit the model and make predictions
    cb_model.fit(X_train, y_train)
    y_pred_train_values = cb_model.predict(X_train)
    y_pred_test_values = cb_model.predict(X_test)

    # Calculate metrics
    r2_train = r2_score(y_train, y_pred_train_values)
    rmse_train = sqrt(mean_squared_error(y_train, y_pred_train_values))
    mae_train = mean_absolute_error(y_train, y_pred_train_values)

    r2_test = r2_score(y_test, y_pred_test_values)
    rmse_test = sqrt(mean_squared_error(y_test, y_pred_test_values))
    mae_test = mean_absolute_error(y_test, y_pred_test_values)

    # Append results to the already defined `results` list
    results.append({
        'Dataset': dataset_name,
        'Method': method_label,
        'Model': config_name,
        'R2_Train': r2_train,
        'RMSE_Train': rmse_train,
        'MAE_Train': mae_train,
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Save the model using pickle with a filename that includes the dataset, method, and configuration name
    model_filename = os.path.join(results_folder, f'catboost_model_{config_name}_{method_label}_{dataset_name}.pkl')
    with open(model_filename, 'wb') as model_file:
        pickle.dump(cb_model, model_file)

    # Plot actual vs predicted for both training and testing
    for data_type, y_true, y_pred_values, title_suffix in zip(
        ['train', 'test'],
        [y_train, y_test],
        [y_pred_train_values, y_pred_test_values],
        ['Training', 'Testing']
    ):
        plt.figure(figsize=(10, 6))
        plt.plot(y_true.index, y_true, label=f'Actual {title_suffix}', color='blue')
        plt.plot(y_true.index, y_pred_values, label=f'Predicted {title_suffix}', color='red')
        plt.title(f'{config_name} - Actual vs Predicted Closing Price ({title_suffix}) - {dataset_name}')
        plt.xlabel('Data point')
        plt.ylabel('Closing Price')
        plt.legend()
        plot_filename = os.path.join(results_folder, f'actual_vs_predicted_{data_type}_{config_name}_{dataset_name}.png')
        plt.savefig(plot_filename)
        plt.close()  # release the figure so repeated calls do not accumulate open figures

# Evaluate every CatBoost configuration on every (training scope, feature set) pair.
# Outer loop: training scope label -> (feature datasets, y_train, y_test).
# Middle loop: feature-set label -> (X_train, X_test).
# Inner loop: configuration name -> CatBoostRegressor keyword arguments.
# Each call appends one row to `results` and writes a pickle plus two plots.
for method_label, (datasets, y_train, y_test) in datasets_dict.items():
    for dataset_name, (X_train, X_test) in datasets.items():
        for config_name, cb_params in catboost_configs.items():
            evaluate_and_save_results_catboost(dataset_name, X_train, X_test, y_train, y_test, method_label, config_name, cb_params)

**Random Forest**

In [None]:
# Define the folder to save all results
# (note: `results_folder` is a module-level name shared with the other model cells)
results_folder = 'random_forest_results'
os.makedirs(results_folder, exist_ok=True)  # Create the folder if it does not exist

# Dictionary to store Random Forest model configurations based on hyperparameter tuning methods.
# Outer key: configuration name. Inner dict: keyword arguments unpacked into
# RandomForestRegressor(**params).
# NOTE(review): the values are presumably copied from earlier tuning runs that
# are not part of this cell — confirm their provenance.
# NOTE(review): both configurations combine `bootstrap=False` with
# `max_features=None`, so every tree is grown on the full training set with all
# features considered at each split; the trees may end up nearly identical —
# confirm this is intended. No `random_state` is set in either configuration.
rf_configs = {
    'RandomForest_RandomizedSearch': {
        'n_estimators': 150,
        'min_samples_split': 10,
        'min_samples_leaf': 4,
        'max_features': None,
        'max_depth': 15,
        'bootstrap': False
    },
    'RandomForest_Optuna': {
        'n_estimators': 100,
        'min_samples_split': 4,
        'min_samples_leaf': 4,
        'max_features': None,
        'max_depth': 11,
        'bootstrap': False
    }
}

# Function to evaluate model, save results, and append to `results`
def evaluate_and_save_results_rf(dataset_name, X_train, X_test, y_train, y_test, method_label, config_name, rf_params):
    """Fit one Random Forest configuration and record its metrics, model and plots.

    Side effects: appends one row to the notebook-level ``results`` list and writes
    a pickled model plus two PNG plots into ``results_folder``.

    Args:
        dataset_name: Label of the feature set; used in the result row and file names.
        X_train, X_test: Feature matrices passed to ``RandomForestRegressor``.
        y_train, y_test: Targets; they must expose ``.index``, which is used as the
            x-axis of the plots.
        method_label: Label of the enclosing ``datasets_dict`` entry; used in the
            result row and file names.
        config_name: Name of the hyperparameter configuration.
        rf_params: Keyword arguments forwarded to ``RandomForestRegressor``.

    Returns:
        None.
    """
    # Initialize the Random Forest model with specified parameters
    rf_model = RandomForestRegressor(**rf_params)

    # Fit the model and make predictions
    rf_model.fit(X_train, y_train)
    y_pred_train_values = rf_model.predict(X_train)
    y_pred_test_values = rf_model.predict(X_test)

    # Calculate metrics
    r2_train = r2_score(y_train, y_pred_train_values)
    rmse_train = sqrt(mean_squared_error(y_train, y_pred_train_values))
    mae_train = mean_absolute_error(y_train, y_pred_train_values)

    r2_test = r2_score(y_test, y_pred_test_values)
    rmse_test = sqrt(mean_squared_error(y_test, y_pred_test_values))
    mae_test = mean_absolute_error(y_test, y_pred_test_values)

    # Append results to the already defined `results` list
    results.append({
        'Dataset': dataset_name,
        'Method': method_label,
        'Model': config_name,
        'R2_Train': r2_train,
        'RMSE_Train': rmse_train,
        'MAE_Train': mae_train,
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Save the model using pickle with a filename that includes the dataset, method, and configuration name
    model_filename = os.path.join(results_folder, f'rf_model_{config_name}_{method_label}_{dataset_name}.pkl')
    with open(model_filename, 'wb') as model_file:
        pickle.dump(rf_model, model_file)

    # Plot actual vs predicted for both training and testing
    for data_type, y_true, y_pred_values, title_suffix in zip(
        ['train', 'test'],
        [y_train, y_test],
        [y_pred_train_values, y_pred_test_values],
        ['Training', 'Testing']
    ):
        plt.figure(figsize=(10, 6))
        plt.plot(y_true.index, y_true, label=f'Actual {title_suffix}', color='blue')
        plt.plot(y_true.index, y_pred_values, label=f'Predicted {title_suffix}', color='red')
        plt.title(f'{config_name} - Actual vs Predicted Closing Price ({title_suffix}) - {dataset_name}')
        plt.xlabel('Data point')
        plt.ylabel('Closing Price')
        plt.legend()
        # Include method_label (as the model file name does) so that plots for the same
        # dataset name under different methods do not overwrite each other.
        plot_filename = os.path.join(
            results_folder,
            f'actual_vs_predicted_{data_type}_{config_name}_{method_label}_{dataset_name}.png'
        )
        plt.savefig(plot_filename)
        plt.close()

# Loop through each dataset type and evaluate for each Random Forest configuration.
# `datasets_dict` maps a method label to (datasets, y_train, y_test), and `datasets`
# maps a dataset name to its (X_train, X_test) pair.
# NOTE: the loop variables are notebook globals and keep the values of the last
# iteration once this cell has finished.
for method_label, (datasets, y_train, y_test) in datasets_dict.items():
    for dataset_name, (X_train, X_test) in datasets.items():
        for config_name, rf_params in rf_configs.items():
            evaluate_and_save_results_rf(dataset_name, X_train, X_test, y_train, y_test, method_label, config_name, rf_params)

In [None]:
# Gather every row appended to `results` so far into one table, rank the rows by
# test RMSE (lowest first), print the ranking and save it to sorted_results.csv.
results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values(by="RMSE_Test", ascending=True)
print(results_df_sorted)
results_df_sorted.to_csv('sorted_results.csv', index=False)

**ANN using Random Grid Search**

In [None]:
# Rebind the notebook-level `results` list so this cell only collects ANN (RGS) rows.
results = []

# Updated ANN parameters dictionary for RGS
# 'layers' is the number of hidden Dense+Dropout pairs and 'neurons' the width of each.
# 'optimizer': any value other than 'adam' makes build_ANN_model_rgs fall back to SGD.
ANN_params_rgs = {
    'batch_size': 64,
    'epochs': 50,
    'layers': 5,
    'neurons': 16,
    'activation': 'relu',
    'learning_rate': 0.01,
    'dropout_rate': 0.5,
    'optimizer': 'adam'
}

# Function to build and compile the ANN model for RGS
def build_ANN_model_rgs(input_shape):
    """Build and compile the feed-forward regressor described by ``ANN_params_rgs``.

    Args:
        input_shape: Number of input features (an int, e.g. ``X_train.shape[1]``).

    Returns:
        A compiled ``Sequential`` model: ``layers`` blocks of Dense + Dropout followed
        by a single linear output unit, trained with mean squared error.
    """
    model = Sequential()
    # Declare the input once with an Input layer instead of repeating `input_shape=`
    # on every hidden Dense layer, where it only applies to the first one.
    model.add(Input(shape=(input_shape,)))
    for _ in range(ANN_params_rgs['layers']):
        model.add(Dense(ANN_params_rgs['neurons'], activation=ANN_params_rgs['activation']))
        model.add(Dropout(ANN_params_rgs['dropout_rate']))
    model.add(Dense(1))  # Output layer for regression

    # Any optimizer name other than 'adam' falls back to SGD.
    if ANN_params_rgs['optimizer'] == 'adam':
        optimizer = Adam(learning_rate=ANN_params_rgs['learning_rate'])
    else:
        optimizer = SGD(learning_rate=ANN_params_rgs['learning_rate'])
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Function to train and evaluate ANN for each dataset using RGS hyperparameters
def evaluate_and_save_results_ANN_rgs(dataset_name, X_train, X_test, y_train, y_test, method_label):
    """Train the RGS-configured ANN on one dataset and record metrics, model and plots.

    Side effects: appends one row to the notebook-level ``results`` list and writes a
    pickled model plus two PNG plots into the current working directory.

    Args:
        dataset_name: Label of the feature set; used in the result row and file names.
        X_train, X_test: 2-D feature matrices (``X_train.shape[1]`` sets the input width).
        y_train, y_test: Targets; they must expose ``.index``, which is used as the
            x-axis of the plots.
        method_label: Label of the enclosing ``datasets_dict`` entry; used in the
            result row and file names.

    Returns:
        None.
    """
    # Build and train the ANN model
    ann_model = build_ANN_model_rgs(X_train.shape[1])
    ann_model.fit(X_train, y_train, epochs=ANN_params_rgs['epochs'], batch_size=ANN_params_rgs['batch_size'], verbose=0)

    # Predict on train and test data
    y_pred_train = ann_model.predict(X_train).flatten()
    y_pred_test = ann_model.predict(X_test).flatten()

    # Calculate metrics. RMSE is taken as sqrt(MSE) because the `squared=False`
    # keyword of mean_squared_error is deprecated/removed in recent scikit-learn.
    r2_train = r2_score(y_train, y_pred_train)
    rmse_train = sqrt(mean_squared_error(y_train, y_pred_train))
    mae_train = mean_absolute_error(y_train, y_pred_train)

    r2_test = r2_score(y_test, y_pred_test)
    rmse_test = sqrt(mean_squared_error(y_test, y_pred_test))
    mae_test = mean_absolute_error(y_test, y_pred_test)

    # Append results to the already defined `results` list
    results.append({
        'Dataset': dataset_name,
        'Method': method_label,
        'Model': 'ANN Regression (RGS)',
        'R2_Train': r2_train,
        'RMSE_Train': rmse_train,
        'MAE_Train': mae_train,
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Save the model using pickle
    model_filename = f'ann_model_rgs_{method_label}_{dataset_name}.pkl'
    with open(model_filename, 'wb') as model_file:
        pickle.dump(ann_model, model_file)

    # Plot actual vs predicted for both training and testing
    for data_type, y_true, y_pred, title_suffix in zip(
        ['train', 'test'],
        [y_train, y_test],
        [y_pred_train, y_pred_test],
        ['Training', 'Testing']
    ):
        plt.figure(figsize=(10, 6))
        plt.plot(y_true.index, y_true, label=f'Actual {title_suffix}', color='blue')
        plt.plot(y_true.index, y_pred, label=f'Predicted {title_suffix}', color='red')
        plt.title(f'ANN Regression (RGS) - Actual vs Predicted Closing Price ({title_suffix}) - {dataset_name}')
        plt.xlabel('Date')
        plt.ylabel('Closing Price')
        plt.legend()
        # Include method_label (as the model file name does) so that plots for the same
        # dataset name under different methods do not overwrite each other.
        plt.savefig(f'actual_vs_predicted_{data_type}_ANN_Regression_RGS_{method_label}_{dataset_name}.png')
        plt.close()

# Loop through each dataset type and evaluate using RGS hyperparameters.
# `datasets_dict` maps a method label to (datasets, y_train, y_test), and `datasets`
# maps a dataset name to its (X_train, X_test) pair.
# NOTE: the loop variables are notebook globals and keep the values of the last
# iteration once this cell has finished.
for method_label, (datasets, y_train, y_test) in datasets_dict.items():
    for dataset_name, (X_train, X_test) in datasets.items():
        evaluate_and_save_results_ANN_rgs(dataset_name, X_train, X_test, y_train, y_test, method_label)

# Persist the rows collected by this cell (unsorted) to their own CSV file.
results_df = pd.DataFrame(results)
results_df.to_csv('ANN_Regression_RandomGridSearch_Results.csv', index=False)


**ANN Optuna**

In [None]:
# Rebind the notebook-level `results` list so this cell only collects ANN (Optuna) rows.
results = []

# Updated ANN parameters dictionary for Optuna
# 'layers' is the number of hidden Dense+Dropout pairs and 'neurons' the width of each.
# 'optimizer': any value other than 'adam' makes build_ANN_model_optuna fall back to SGD.
ANN_params_optuna = {
    'batch_size': 64,
    'epochs': 44,
    'layers': 3,
    'neurons': 64,
    'activation': 'relu',
    'learning_rate': 0.00022424536912470238,
    'dropout_rate': 0.1562540817150868,
    'optimizer': 'adam'
}

# Function to build and compile the ANN model for Optuna
def build_ANN_model_optuna(input_shape):
    """Build and compile the feed-forward regressor described by ``ANN_params_optuna``.

    Args:
        input_shape: Number of input features (an int, e.g. ``X_train.shape[1]``).

    Returns:
        A compiled ``Sequential`` model: ``layers`` blocks of Dense + Dropout followed
        by a single linear output unit, trained with mean squared error.
    """
    model = Sequential()
    # Declare the input once with an Input layer instead of repeating `input_shape=`
    # on every hidden Dense layer, where it only applies to the first one.
    model.add(Input(shape=(input_shape,)))
    for _ in range(ANN_params_optuna['layers']):
        model.add(Dense(ANN_params_optuna['neurons'], activation=ANN_params_optuna['activation']))
        model.add(Dropout(ANN_params_optuna['dropout_rate']))
    model.add(Dense(1))  # Output layer for regression

    # Any optimizer name other than 'adam' falls back to SGD.
    if ANN_params_optuna['optimizer'] == 'adam':
        optimizer = Adam(learning_rate=ANN_params_optuna['learning_rate'])
    else:
        optimizer = SGD(learning_rate=ANN_params_optuna['learning_rate'])
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Function to train and evaluate ANN for each dataset using Optuna hyperparameters
def evaluate_and_save_results_ANN_optuna(dataset_name, X_train, X_test, y_train, y_test, method_label):
    """Train the Optuna-configured ANN on one dataset and record metrics, model and plots.

    Side effects: appends one row to the notebook-level ``results`` list and writes a
    pickled model plus two PNG plots into the current working directory.

    Args:
        dataset_name: Label of the feature set; used in the result row and file names.
        X_train, X_test: 2-D feature matrices (``X_train.shape[1]`` sets the input width).
        y_train, y_test: Targets; they must expose ``.index``, which is used as the
            x-axis of the plots.
        method_label: Label of the enclosing ``datasets_dict`` entry; used in the
            result row and file names.

    Returns:
        None.
    """
    # Build and train the ANN model
    ann_model = build_ANN_model_optuna(X_train.shape[1])
    ann_model.fit(X_train, y_train, epochs=ANN_params_optuna['epochs'], batch_size=ANN_params_optuna['batch_size'], verbose=0)

    # Predict on train and test data
    y_pred_train = ann_model.predict(X_train).flatten()
    y_pred_test = ann_model.predict(X_test).flatten()

    # Calculate metrics. RMSE is taken as sqrt(MSE) because the `squared=False`
    # keyword of mean_squared_error is deprecated/removed in recent scikit-learn.
    r2_train = r2_score(y_train, y_pred_train)
    rmse_train = sqrt(mean_squared_error(y_train, y_pred_train))
    mae_train = mean_absolute_error(y_train, y_pred_train)

    r2_test = r2_score(y_test, y_pred_test)
    rmse_test = sqrt(mean_squared_error(y_test, y_pred_test))
    mae_test = mean_absolute_error(y_test, y_pred_test)

    # Append results to the already defined `results` list
    results.append({
        'Dataset': dataset_name,
        'Method': method_label,
        'Model': 'ANN Regression (Optuna)',
        'R2_Train': r2_train,
        'RMSE_Train': rmse_train,
        'MAE_Train': mae_train,
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Save the model using pickle
    model_filename = f'ann_model_optuna_{method_label}_{dataset_name}.pkl'
    with open(model_filename, 'wb') as model_file:
        pickle.dump(ann_model, model_file)

    # Plot actual vs predicted for both training and testing
    for data_type, y_true, y_pred, title_suffix in zip(
        ['train', 'test'],
        [y_train, y_test],
        [y_pred_train, y_pred_test],
        ['Training', 'Testing']
    ):
        plt.figure(figsize=(10, 6))
        plt.plot(y_true.index, y_true, label=f'Actual {title_suffix}', color='blue')
        plt.plot(y_true.index, y_pred, label=f'Predicted {title_suffix}', color='red')
        plt.title(f'ANN Regression (Optuna) - Actual vs Predicted Closing Price ({title_suffix}) - {dataset_name}')
        plt.xlabel('Date')
        plt.ylabel('Closing Price')
        plt.legend()
        # Include method_label (as the model file name does) so that plots for the same
        # dataset name under different methods do not overwrite each other.
        plt.savefig(f'actual_vs_predicted_{data_type}_ANN_Regression_Optuna_{method_label}_{dataset_name}.png')
        plt.close()

# Loop through each dataset type and evaluate using Optuna hyperparameters.
# `datasets_dict` maps a method label to (datasets, y_train, y_test), and `datasets`
# maps a dataset name to its (X_train, X_test) pair.
# NOTE: the loop variables are notebook globals and keep the values of the last
# iteration once this cell has finished.
for method_label, (datasets, y_train, y_test) in datasets_dict.items():
    for dataset_name, (X_train, X_test) in datasets.items():
        evaluate_and_save_results_ANN_optuna(dataset_name, X_train, X_test, y_train, y_test, method_label)

# Persist the rows collected by this cell (unsorted) to their own CSV file.
results_df = pd.DataFrame(results)
results_df.to_csv('ANN_Regression_Optuna_Results.csv', index=False)

**Prepare Data for LSTM, RNN, GRU**

In [None]:
# Load the full training and testing datasets
# NOTE(review): no date parsing is requested, so 'Date' presumably stays a string column
# and the later sort_values(by='Date') orders it lexicographically -- this is only
# chronological for ISO-like date strings; TODO confirm the CSV format.
train_data = pd.read_csv("train_data_lstm.csv")
test_data = pd.read_csv("test_data_lstm.csv")

# Define the target and features
# `feature_columns` keeps every training column except the identifier/metadata columns
# listed below; 'Close' is not excluded, so past closing prices are part of the inputs.
# `target_column` is defined here, but prepare_sequences in this cell reads 'Close' directly.
target_column = 'Close'
feature_columns = [col for col in train_data.columns if col not in ['Date', 'Symbol', 'GICS Sector', 'Headquarters Location', 'Founded']]

# Prepare sequences function with debugging to verify shapes
def prepare_sequences(data, sequence_length=5):
    """Turn per-company rows into sliding windows and next-step closing prices.

    Rows are grouped by 'Symbol' and ordered by 'Date' so that no window mixes
    companies. For each company, every run of ``sequence_length`` consecutive rows of
    the module-level ``feature_columns`` becomes one sample, and the 'Close' value of
    the row right after the run becomes its target.

    Args:
        data: DataFrame containing 'Symbol', 'Date', 'Close' and ``feature_columns``.
        sequence_length: Number of consecutive rows per window.

    Returns:
        ``(X, y)`` as numpy arrays; the shapes are also printed.
    """
    windows = []
    next_closes = []

    for _, company_rows in data.groupby('Symbol'):
        ordered = company_rows.sort_values(by='Date')
        feature_matrix = ordered[feature_columns].values
        closes = ordered['Close'].values

        # A company with too few rows yields an empty range and adds nothing.
        starts = range(len(feature_matrix) - sequence_length)
        windows.extend(feature_matrix[start:start + sequence_length] for start in starts)
        next_closes.extend(closes[start + sequence_length] for start in starts)

    # Convert lists to numpy arrays and check shapes
    X = np.array(windows)
    y = np.array(next_closes)
    print(f"Prepared sequences: X shape = {X.shape}, y shape = {y.shape}")
    return X, y

def create_scaled_sequences(train_data, test_data, symbols=None, sequence_length=5):
    """Build train/test sequence windows and min-max scale them with train statistics.

    Args:
        train_data, test_data: DataFrames accepted by ``prepare_sequences``.
        symbols: Optional collection of 'Symbol' values; when given, both frames are
            restricted to those symbols before windows are built.
        sequence_length: Window length forwarded to ``prepare_sequences``.

    Returns:
        An 8-tuple ``(X_train, y_train, feature_scaler, target_scaler, X_test, y_test,
        feature_scaler, target_scaler)``. Both scalers are fitted on the training
        windows only, and the same two scaler objects appear twice in the tuple.
    """
    # Optional restriction to a subset of companies
    if symbols is not None:
        train_data = train_data[train_data['Symbol'].isin(symbols)]
        test_data = test_data[test_data['Symbol'].isin(symbols)]

    # Window construction (unscaled)
    X_train_raw, y_train_raw = prepare_sequences(train_data, sequence_length)
    X_test_raw, y_test_raw = prepare_sequences(test_data, sequence_length)

    print(f"Raw shapes before scaling: X_train shape = {X_train_raw.shape}, y_train shape = {y_train_raw.shape}")
    print(f"Raw shapes before scaling: X_test shape = {X_test_raw.shape}, y_test shape = {y_test_raw.shape}")

    feature_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()

    # MinMaxScaler works on 2-D input, so windows are flattened to (rows, features),
    # scaled, and folded back into their original 3-D shape.
    train_rows = X_train_raw.reshape(-1, X_train_raw.shape[-1])
    X_train = feature_scaler.fit_transform(train_rows).reshape(X_train_raw.shape)

    test_rows = X_test_raw.reshape(-1, X_test_raw.shape[-1])
    X_test = feature_scaler.transform(test_rows).reshape(X_test_raw.shape)

    # Targets are scaled as a single column and flattened back to 1-D.
    y_train = target_scaler.fit_transform(y_train_raw.reshape(-1, 1)).flatten()
    y_test = target_scaler.transform(y_test_raw.reshape(-1, 1)).flatten()

    print(f"Scaled shapes: X_train shape = {X_train.shape}, y_train shape = {y_train.shape}")
    print(f"Scaled shapes: X_test shape = {X_test.shape}, y_test shape = {y_test.shape}")

    return X_train, y_train, feature_scaler, target_scaler, X_test, y_test, feature_scaler, target_scaler

# Build `datasets_dict` for the sequence models. Each entry maps a dataset name to a
# 4-tuple (X, y, feature_scaler, target_scaler); keys ending in "_Test" hold test sets.

# `create_scaled_sequences` returns eight items (train arrays, scalers, test arrays,
# and the same scalers again), so every call below unpacks eight names.

X_train_all, y_train_all, feature_scaler_all, target_scaler_all, X_test_all, y_test_all, feature_scaler_test_all, target_scaler_test_all = create_scaled_sequences(train_data, test_data)

# Prepare datasets for some companies (first half of symbols)
half_symbols = train_data['Symbol'].unique()[:len(train_data['Symbol'].unique()) // 2]
X_train_some, y_train_some, feature_scaler_some, target_scaler_some, X_test_some, y_test_some, feature_scaler_test_some, target_scaler_test_some = create_scaled_sequences(train_data, test_data, symbols=half_symbols)

# Populate `datasets_dict` with properly unpacked values.
# Only the training halves are stored here; X_test_all / X_test_some and their
# companions are computed above but not added to the dictionary.
datasets_dict = {
    'All_Company': (X_train_all, y_train_all, feature_scaler_all, target_scaler_all),
    'Some_Company': (X_train_some, y_train_some, feature_scaler_some, target_scaler_some)
}

# Per-symbol entries: each listed symbol contributes a training entry and a "<symbol>_Test" entry.
# NOTE: the names unpacked in this loop (X_train, y_train, target_scaler, ...) stay
# defined as notebook globals after the loop; the RNN and GRU evaluation functions
# further down read the global `target_scaler`.
for symbol in ['AAPL']:
    X_train, y_train, feature_scaler, target_scaler, X_test, y_test, feature_scaler_test, target_scaler_test = create_scaled_sequences(train_data, test_data, symbols=[symbol])
    datasets_dict[symbol] = (X_train, y_train, feature_scaler, target_scaler)
    datasets_dict[f"{symbol}_Test"] = (X_test, y_test, feature_scaler_test, target_scaler_test)


**LSTM**

In [None]:
# Define both sets of LSTM hyperparameters
# NOTE: build_lstm_model always constructs an Adam optimizer, so the 'optimizer'
# entries below are informational only and are not read by it.
lstm_params_rgs = {
    'optimizer': 'adam',
    'learning_rate': 0.005,
    'n_layers': 5,
    'n_units': 100,
    'dropout_rate': 0.2,
    'epochs': 20,
    'batch_size': 64
}

lstm_params_optuna = {
    'optimizer': 'adam',
    'learning_rate': 0.006431528679768236,
    'n_layers': 7,
    'n_units': 54,
    'dropout_rate': 0.24985937241800743,
    'epochs': 15,
    'batch_size': 128
}

# Define both sets of hyperparameters to iterate over
# (the keys become the 'Parameter_Set' label in result rows and model file names)
parameter_sets = {
    'RGS': lstm_params_rgs,
    'Optuna': lstm_params_optuna
}

# Function to build the LSTM model
def build_lstm_model(input_shape, params):
    """Assemble and compile a stacked LSTM regressor.

    Args:
        input_shape: ``(timesteps, features)`` tuple for the Input layer.
        params: Mapping providing 'n_layers', 'n_units', 'dropout_rate' and
            'learning_rate'.

    Returns:
        A compiled ``Sequential`` model: ``n_layers`` LSTM + Dropout pairs followed by
        a single-unit Dense output, optimised with Adam on MSE (MAE tracked as a metric).
    """
    depth = params['n_layers']

    model = Sequential()
    model.add(Input(shape=input_shape))

    for layer_idx in range(depth):
        # Only the last recurrent layer collapses the sequence to one vector;
        # every earlier layer must pass the full sequence to the next LSTM.
        is_last_layer = layer_idx == depth - 1
        model.add(LSTM(params['n_units'], return_sequences=not is_last_layer))
        model.add(Dropout(params['dropout_rate']))

    model.add(Dense(1))
    model.compile(
        optimizer=Adam(learning_rate=params['learning_rate']),
        loss='mse',
        metrics=['mae'],
    )
    return model

# Initialize results list
results = []

# Function to evaluate and save LSTM results for each train-test-parameter set combination
def evaluate_and_save_results_lstm(train_name, test_name, X_train, X_test, y_train, y_test, target_scaler_train, target_scaler_test, param_label, params, validation_split=0.1):
    """Train one LSTM configuration and record train/test metrics and the fitted model.

    Side effects: appends one row to the notebook-level ``results`` list and pickles
    the fitted model into the current working directory.

    Args:
        train_name, test_name: Names of the training and test datasets; used in the
            result row and the model file name.
        X_train, X_test: 3-D arrays shaped (samples, timesteps, features).
        y_train, y_test: 1-D arrays of scaled targets.
        target_scaler_train, target_scaler_test: Fitted scalers used to map train and
            test targets/predictions back to price units.
        param_label: Label of the hyperparameter set.
        params: Mapping with the keys read by ``build_lstm_model`` plus 'epochs' and
            'batch_size'.
        validation_split: Fraction of the training samples (taken from the end of the
            arrays by Keras) held out to drive early stopping.

    Returns:
        None.
    """
    input_shape = (X_train.shape[1], X_train.shape[2])
    lstm_model = build_lstm_model(input_shape, params)
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Early stopping monitors a slice held out from the training data. Monitoring the
    # test set (with restore_best_weights=True) would select the weights that score
    # best on the very data used for the reported test metrics.
    lstm_model.fit(
        X_train, y_train,
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        validation_split=validation_split,
        callbacks=[early_stop],
        verbose=0
    )

    def to_price(scaler, scaled_values):
        """Map scaled targets/predictions back to price units as a 1-D array."""
        return scaler.inverse_transform(np.asarray(scaled_values).reshape(-1, 1)).flatten()

    # Predict and calculate metrics on train data
    y_pred_train = to_price(target_scaler_train, lstm_model.predict(X_train))
    y_train_original = to_price(target_scaler_train, y_train)
    r2_train = r2_score(y_train_original, y_pred_train)
    rmse_train = np.sqrt(mean_squared_error(y_train_original, y_pred_train))
    mae_train = mean_absolute_error(y_train_original, y_pred_train)

    # Predict and calculate metrics on test data
    y_pred_test = to_price(target_scaler_test, lstm_model.predict(X_test))
    y_test_original = to_price(target_scaler_test, y_test)
    r2_test = r2_score(y_test_original, y_pred_test)
    rmse_test = np.sqrt(mean_squared_error(y_test_original, y_pred_test))
    mae_test = mean_absolute_error(y_test_original, y_pred_test)

    # Append results for this train-test-parameter set combination
    results.append({
        'Train_Dataset': train_name,
        'Test_Dataset': test_name,
        'Parameter_Set': param_label,
        'Model': 'LSTM',
        'R2_Train': r2_train,
        'RMSE_Train': rmse_train,
        'MAE_Train': mae_train,
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Save model. The file name pattern is kept unchanged because the LSTM plotting
    # cell rebuilds exactly this name to reload the model.
    model_filename = f'lstm_model_{train_name}_trained_on_{test_name}_params_{param_label}.pkl'
    with open(model_filename, 'wb') as model_file:
        pickle.dump(lstm_model, model_file)

# Loop over each combination of train and test datasets, and parameter sets.
# Every `datasets_dict` entry whose key lacks "_Test" is treated as a training set and
# is paired with every entry whose key contains "_Test".
# NOTE: the loop variables (X_train, y_train, target_scaler_train, X_test, ...) are
# notebook globals and keep the values of the last iteration after this cell has run.
for train_name, (X_train, y_train, feature_scaler_train, target_scaler_train) in datasets_dict.items():
    if "_Test" not in train_name:  # Skip test sets in the training loop
        for test_name, (X_test, y_test, feature_scaler_test, target_scaler_test) in datasets_dict.items():
            if "_Test" in test_name:  # Use only test sets in the testing loop
                # Evaluate LSTM model with both RGS and Optuna parameter sets
                for param_label, params in parameter_sets.items():
                    evaluate_and_save_results_lstm(
                        train_name, test_name, X_train, X_test, y_train, y_test,
                        target_scaler_train, target_scaler_test, param_label, params
                    )

# Convert results to DataFrame and save
# (the LSTM plotting cell reads this exact file name back)
results_df = pd.DataFrame(results)
results_df.to_csv('LSTM_Train_Test_Results_98_combinations.csv', index=False)


**RNN**

In [None]:
# Initialize results list to store evaluation metrics
# (rebinds the notebook-level `results`, so this cell only collects RNN rows)
results = []

# RNN model configurations based on different hyperparameter tuning methods
# NOTE: build_rnn_model passes 'grad_clip' to a max-norm kernel constraint (a cap on
# weight norms), not to gradient clipping, and it always constructs an Adam optimizer,
# so the 'optimizer' entries are not read by it.
rnn_configs = {
    'RNN_Custom': {
        'optimizer': 'adam',
        'learning_rate': 0.005,
        'layers': 2,
        'hidden_units': 50,
        'grad_clip': 5,
        'dropout_rate': 0.1,
        'epochs': 50,
        'batch_size': 64
    },
    'RNN_Optuna': {
        'layers': 2,
        'hidden_units': 69,
        'dropout_rate': 0.06695188043530183,
        'learning_rate': 0.010354065744661594,
        'optimizer': 'adam',
        'grad_clip': 3,
        'batch_size': 32,
        'epochs': 24
    }
}

# Function to build the RNN model
def build_rnn_model(input_shape, params):
    """Assemble and compile a stacked SimpleRNN regressor.

    Args:
        input_shape: ``(timesteps, features)`` tuple for the Input layer.
        params: Mapping providing 'layers', 'hidden_units', 'grad_clip',
            'dropout_rate' and 'learning_rate'.

    Returns:
        A compiled ``Sequential`` model: ``layers`` SimpleRNN + Dropout pairs followed
        by a single-unit Dense output, optimised with Adam on MSE (MAE tracked as a metric).
    """
    n_layers = params['layers']

    model = Sequential()
    model.add(Input(shape=input_shape))

    for depth in range(n_layers):
        # All recurrent layers but the last hand the full sequence to the next one.
        # 'grad_clip' is applied as a max-norm constraint on the input kernel.
        recurrent_layer = SimpleRNN(
            params['hidden_units'],
            return_sequences=(depth < n_layers - 1),
            activation='tanh',
            kernel_constraint=tf.keras.constraints.max_norm(params['grad_clip']),
        )
        model.add(recurrent_layer)
        model.add(Dropout(params['dropout_rate']))

    model.add(Dense(1))  # Output layer for regression
    model.compile(
        optimizer=Adam(learning_rate=params['learning_rate']),
        loss='mse',
        metrics=['mae'],
    )
    return model

# Prepare sequences function with only past 5-day features (exclude today's features)
def prepare_sequences(data, sequence_length=5):
    """Turn per-company rows into sliding windows and next-step closing prices.

    This redefines the ``prepare_sequences`` from the sequence-preparation cell; unlike
    that version it derives the feature columns from the data itself and prints nothing.

    Args:
        data: DataFrame containing at least 'Symbol', 'Date' and 'Close'.
        sequence_length: Number of consecutive rows per window.

    Returns:
        ``(X, y)`` as numpy arrays: each X entry holds ``sequence_length`` consecutive
        rows of one company (all columns except the metadata columns), and the matching
        y entry is the 'Close' value of the row that follows the window.
    """
    metadata_columns = ['Date', 'Symbol', 'GICS Sector', 'Headquarters Location', 'Founded']
    window_list, target_list = [], []

    # One company at a time, so a window never spans two companies
    for _, company_rows in data.groupby('Symbol'):
        ordered = company_rows.sort_values(by='Date')  # Ensure time order
        kept_columns = [name for name in ordered.columns if name not in metadata_columns]
        feature_matrix = ordered[kept_columns].values
        closes = ordered['Close'].values

        # `end` is the index of the target row; the window is the rows just before it
        for end in range(sequence_length, len(feature_matrix)):
            window_list.append(feature_matrix[end - sequence_length:end])
            target_list.append(closes[end])

    return np.array(window_list), np.array(target_list)

# Function to evaluate model, save results, and append to results
def evaluate_and_save_results_rnn(dataset_name, X_train, X_test, y_train, y_test, method_label, config_name, rnn_params,
                                  target_scaler_train=None, target_scaler_test=None, validation_split=0.1):
    """Train one RNN configuration and record metrics, the fitted model and plots.

    Side effects: appends one row to the notebook-level ``results`` list and writes a
    pickled model plus two PNG plots into the current working directory.

    Args:
        dataset_name: Name stored in the 'Dataset' column and used in file names.
        X_train, X_test: 3-D arrays shaped (samples, timesteps, features).
        y_train, y_test: 1-D arrays of scaled targets.
        method_label: Name stored in the 'Method' column and used in file names.
        config_name: Name of the hyperparameter configuration.
        rnn_params: Mapping with the keys read by ``build_rnn_model`` plus 'epochs'
            and 'batch_size'.
        target_scaler_train: Fitted scaler for training targets/predictions. When
            omitted, the notebook-level ``target_scaler`` is used (the behaviour of
            the original function).
        target_scaler_test: Fitted scaler for test targets/predictions. Defaults to
            ``target_scaler_train``.
        validation_split: Fraction of the training samples (taken from the end of the
            arrays by Keras) held out to drive early stopping.

    Returns:
        None.
    """
    # Resolve the scalers explicitly instead of silently depending on whichever
    # `target_scaler` the last executed cell left in the kernel.
    if target_scaler_train is None:
        target_scaler_train = target_scaler
    if target_scaler_test is None:
        target_scaler_test = target_scaler_train

    # Initialize and build the RNN model
    input_shape = (X_train.shape[1], X_train.shape[2])
    rnn_model = build_rnn_model(input_shape, rnn_params)

    # Early stopping to prevent overfitting
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Early stopping monitors a slice held out from the training data. Monitoring the
    # test set (with restore_best_weights=True) would select the weights that score
    # best on the very data used for the reported test metrics.
    rnn_model.fit(
        X_train, y_train,
        epochs=rnn_params['epochs'],
        batch_size=rnn_params['batch_size'],
        validation_split=validation_split,
        callbacks=[early_stop],
        verbose=0
    )

    def to_price(scaler, scaled_values):
        """Map scaled targets/predictions back to price units as a 1-D array."""
        return scaler.inverse_transform(np.asarray(scaled_values).reshape(-1, 1)).flatten()

    # Predict on train and test data and apply inverse transformation
    y_pred_train = to_price(target_scaler_train, rnn_model.predict(X_train))
    y_pred_test = to_price(target_scaler_test, rnn_model.predict(X_test))

    # Inverse transform y_train and y_test for accurate plotting
    y_train_original = to_price(target_scaler_train, y_train)
    y_test_original = to_price(target_scaler_test, y_test)

    # Calculate metrics
    r2_train = r2_score(y_train_original, y_pred_train)
    rmse_train = np.sqrt(mean_squared_error(y_train_original, y_pred_train))
    mae_train = mean_absolute_error(y_train_original, y_pred_train)

    r2_test = r2_score(y_test_original, y_pred_test)
    rmse_test = np.sqrt(mean_squared_error(y_test_original, y_pred_test))
    mae_test = mean_absolute_error(y_test_original, y_pred_test)

    # Append results to the already defined results list
    results.append({
        'Dataset': dataset_name,
        'Method': method_label,
        'Model': config_name,
        'R2_Train': r2_train,
        'RMSE_Train': rmse_train,
        'MAE_Train': mae_train,
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Save the model using pickle
    model_filename = f'rnn_model_{config_name}_{method_label}_{dataset_name}.pkl'
    with open(model_filename, 'wb') as model_file:
        pickle.dump(rnn_model, model_file)

    # Plot actual vs predicted for both training and testing
    for data_type, y_true, y_pred, title_suffix in zip(
        ['train', 'test'],
        [y_train_original, y_test_original],
        [y_pred_train, y_pred_test],
        ['Training', 'Testing']
    ):
        plt.figure(figsize=(10, 6))
        plt.plot(range(len(y_true)), y_true, label=f'Actual {title_suffix}', color='blue')
        plt.plot(range(len(y_pred)), y_pred, label=f'Predicted {title_suffix}', color='red')
        plt.title(f'{config_name} - Actual vs Predicted Closing Price ({title_suffix}) - {dataset_name}')
        plt.xlabel('Samples')
        plt.ylabel('Closing Price')
        plt.legend()
        # Include method_label (as the model file name does) so that plots that share
        # a dataset name do not overwrite each other.
        plt.savefig(f'actual_vs_predicted_{data_type}_{config_name}_{method_label}_{dataset_name}.png')
        plt.close()

# Loop through each train/test dataset pair and evaluate for each RNN configuration.
# The sequence-preparation cell stores 4-tuples (X, y, feature_scaler, target_scaler) in
# `datasets_dict`, with test sets under keys containing "_Test"; the pairing below
# mirrors the LSTM loop. The training-set name is recorded as 'Dataset' and the
# test-set name as 'Method'.
for train_name, (X_train, y_train, feature_scaler_train, target_scaler_train) in datasets_dict.items():
    if "_Test" in train_name:  # Test sets are never used for training
        continue
    for test_name, (X_test, y_test, feature_scaler_test, target_scaler_test) in datasets_dict.items():
        if "_Test" not in test_name:  # Only evaluate on test sets
            continue
        for config_name, rnn_params in rnn_configs.items():
            evaluate_and_save_results_rnn(
                train_name, X_train, X_test, y_train, y_test, test_name, config_name, rnn_params,
                target_scaler_train=target_scaler_train, target_scaler_test=target_scaler_test
            )

# Convert results to DataFrame and save
results_df = pd.DataFrame(results)
results_df.to_csv('RNN_Results.csv', index=False)


**GRU**

In [None]:
# Initialize results list to store evaluation metrics
# (rebinds the notebook-level `results`, so this cell only collects GRU rows)
results = []

# GRU model configurations based on different hyperparameter tuning methods
# NOTE: build_gru_model passes 'grad_clip' to a max-norm kernel constraint (a cap on
# weight norms), not to gradient clipping, and it always constructs an Adam optimizer,
# so the 'optimizer' entries are not read by it.
gru_configs = {
    'GRU_RandomizedSearch': {
        'optimizer': 'adam',
        'learning_rate': 0.02,
        'layers': 1,
        'hidden_units': 100,
        'grad_clip': 5,
        'dropout_rate': 0.2,
        'epochs': 10,
        'batch_size': 64
    },
    'GRU_Optuna': {
        'layers': 1,
        'hidden_units': 116,
        'dropout_rate': 0.24446919327565325,
        'learning_rate': 0.024719470804780756,
        'optimizer': 'adam',
        'grad_clip': 5,
        'batch_size': 32,
        'epochs': 10
    }
}

# Function to build the GRU model
def build_gru_model(input_shape, params):
    """Assemble and compile a stacked GRU regressor.

    Args:
        input_shape: ``(timesteps, features)`` tuple for the Input layer.
        params: Mapping providing 'layers', 'hidden_units', 'grad_clip',
            'dropout_rate' and 'learning_rate'.

    Returns:
        A compiled ``Sequential`` model: ``layers`` GRU + Dropout pairs followed by a
        single-unit Dense output, optimised with Adam on MSE (MAE tracked as a metric).
    """
    n_layers = params['layers']

    model = Sequential()
    model.add(Input(shape=input_shape))

    for depth in range(n_layers):
        # All recurrent layers but the last hand the full sequence to the next one.
        # 'grad_clip' is applied as a max-norm constraint on the input kernel.
        recurrent_layer = GRU(
            params['hidden_units'],
            return_sequences=(depth < n_layers - 1),
            activation='tanh',
            kernel_constraint=tf.keras.constraints.max_norm(params['grad_clip']),
        )
        model.add(recurrent_layer)
        model.add(Dropout(params['dropout_rate']))

    model.add(Dense(1))  # Output layer for regression
    model.compile(
        optimizer=Adam(learning_rate=params['learning_rate']),
        loss='mse',
        metrics=['mae'],
    )
    return model

# Prepare sequences function with only past 5-day features (exclude today's features)
def prepare_sequences(data, sequence_length=5):
    """Turn per-company rows into sliding windows and next-step closing prices.

    This is another redefinition of ``prepare_sequences``; like the one in the RNN cell
    it derives the feature columns from the data itself and prints nothing.

    Args:
        data: DataFrame containing at least 'Symbol', 'Date' and 'Close'.
        sequence_length: Number of consecutive rows per window.

    Returns:
        ``(X, y)`` as numpy arrays: each X entry holds ``sequence_length`` consecutive
        rows of one company (all columns except the metadata columns), and the matching
        y entry is the 'Close' value of the row that follows the window.
    """
    skipped = ('Date', 'Symbol', 'GICS Sector', 'Headquarters Location', 'Founded')
    samples = []
    labels = []

    # Companies are handled separately so that no window mixes two of them
    for _, rows in data.groupby('Symbol'):
        rows = rows.sort_values(by='Date')  # Ensure time order
        feature_names = [column for column in rows.columns if column not in skipped]
        values = rows[feature_names].values
        close_prices = rows['Close'].values

        n_samples = len(values) - sequence_length
        for offset in range(n_samples):
            stop = offset + sequence_length
            samples.append(values[offset:stop])   # the window of past rows
            labels.append(close_prices[stop])     # the close right after the window

    return np.array(samples), np.array(labels)

# Function to evaluate model, save results, and append to results
def evaluate_and_save_results_gru(dataset_name, X_train, X_test, y_train, y_test, method_label, config_name, gru_params,
                                  target_scaler_train=None, target_scaler_test=None, validation_split=0.1):
    """Train one GRU configuration and record metrics, the fitted model and plots.

    Side effects: appends one row to the notebook-level ``results`` list and writes a
    pickled model plus two PNG plots into the current working directory.

    Args:
        dataset_name: Name stored in the 'Dataset' column and used in file names.
        X_train, X_test: 3-D arrays shaped (samples, timesteps, features).
        y_train, y_test: 1-D arrays of scaled targets.
        method_label: Name stored in the 'Method' column and used in file names.
        config_name: Name of the hyperparameter configuration.
        gru_params: Mapping with the keys read by ``build_gru_model`` plus 'epochs'
            and 'batch_size'.
        target_scaler_train: Fitted scaler for training targets/predictions. When
            omitted, the notebook-level ``target_scaler`` is used (the behaviour of
            the original function).
        target_scaler_test: Fitted scaler for test targets/predictions. Defaults to
            ``target_scaler_train``.
        validation_split: Fraction of the training samples (taken from the end of the
            arrays by Keras) held out to drive early stopping.

    Returns:
        None.
    """
    # Resolve the scalers explicitly instead of silently depending on whichever
    # `target_scaler` the last executed cell left in the kernel.
    if target_scaler_train is None:
        target_scaler_train = target_scaler
    if target_scaler_test is None:
        target_scaler_test = target_scaler_train

    # Initialize and build the GRU model
    input_shape = (X_train.shape[1], X_train.shape[2])
    gru_model = build_gru_model(input_shape, gru_params)

    # Early stopping to prevent overfitting
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Early stopping monitors a slice held out from the training data. Monitoring the
    # test set (with restore_best_weights=True) would select the weights that score
    # best on the very data used for the reported test metrics.
    gru_model.fit(
        X_train, y_train,
        epochs=gru_params['epochs'],
        batch_size=gru_params['batch_size'],
        validation_split=validation_split,
        callbacks=[early_stop],
        verbose=0
    )

    def to_price(scaler, scaled_values):
        """Map scaled targets/predictions back to price units as a 1-D array."""
        return scaler.inverse_transform(np.asarray(scaled_values).reshape(-1, 1)).flatten()

    # Predict on train and test data and apply inverse transformation
    y_pred_train = to_price(target_scaler_train, gru_model.predict(X_train))
    y_pred_test = to_price(target_scaler_test, gru_model.predict(X_test))

    # Inverse transform y_train and y_test for accurate plotting
    y_train_original = to_price(target_scaler_train, y_train)
    y_test_original = to_price(target_scaler_test, y_test)

    # Calculate metrics
    r2_train = r2_score(y_train_original, y_pred_train)
    rmse_train = np.sqrt(mean_squared_error(y_train_original, y_pred_train))
    mae_train = mean_absolute_error(y_train_original, y_pred_train)

    r2_test = r2_score(y_test_original, y_pred_test)
    rmse_test = np.sqrt(mean_squared_error(y_test_original, y_pred_test))
    mae_test = mean_absolute_error(y_test_original, y_pred_test)

    # Append results to the already defined results list
    results.append({
        'Dataset': dataset_name,
        'Method': method_label,
        'Model': config_name,
        'R2_Train': r2_train,
        'RMSE_Train': rmse_train,
        'MAE_Train': mae_train,
        'R2_Test': r2_test,
        'RMSE_Test': rmse_test,
        'MAE_Test': mae_test
    })

    # Save the model using pickle
    model_filename = f'gru_model_{config_name}_{method_label}_{dataset_name}.pkl'
    with open(model_filename, 'wb') as model_file:
        pickle.dump(gru_model, model_file)

    # Plot actual vs predicted for both training and testing
    for data_type, y_true, y_pred, title_suffix in zip(
        ['train', 'test'],
        [y_train_original, y_test_original],
        [y_pred_train, y_pred_test],
        ['Training', 'Testing']
    ):
        plt.figure(figsize=(10, 6))
        plt.plot(range(len(y_true)), y_true, label=f'Actual {title_suffix}', color='blue')
        plt.plot(range(len(y_pred)), y_pred, label=f'Predicted {title_suffix}', color='red')
        plt.title(f'{config_name} - Actual vs Predicted Closing Price ({title_suffix}) - {dataset_name}')
        plt.xlabel('Samples')
        plt.ylabel('Closing Price')
        plt.legend()
        # Include method_label (as the model file name does) so that plots that share
        # a dataset name do not overwrite each other.
        plt.savefig(f'actual_vs_predicted_{data_type}_{config_name}_{method_label}_{dataset_name}.png')
        plt.close()

# Loop through each train/test dataset pair and evaluate for each GRU configuration.
# The sequence-preparation cell stores 4-tuples (X, y, feature_scaler, target_scaler) in
# `datasets_dict`, with test sets under keys containing "_Test"; the pairing below
# mirrors the LSTM loop. The training-set name is recorded as 'Dataset' and the
# test-set name as 'Method'.
for train_name, (X_train, y_train, feature_scaler_train, target_scaler_train) in datasets_dict.items():
    if "_Test" in train_name:  # Test sets are never used for training
        continue
    for test_name, (X_test, y_test, feature_scaler_test, target_scaler_test) in datasets_dict.items():
        if "_Test" not in test_name:  # Only evaluate on test sets
            continue
        for config_name, gru_params in gru_configs.items():
            evaluate_and_save_results_gru(
                train_name, X_train, X_test, y_train, y_test, test_name, config_name, gru_params,
                target_scaler_train=target_scaler_train, target_scaler_test=target_scaler_test
            )

# Convert results to DataFrame and save
results_df = pd.DataFrame(results)
results_df.to_csv('GRU_Results.csv', index=False)


**Plots for LSTM**

In [106]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# Load the results DataFrame
# (re-read from the CSV written by the LSTM training cell; this rebinds `results_df`)
results_df = pd.read_csv('LSTM_Train_Test_Results_98_combinations.csv')

# Define a function to create and save plots for actual vs. predicted values
def plot_actual_vs_predicted(y_actual, y_pred, data_type, param_label, train_name, test_name):
    """Plot actual against predicted closing prices and save the figure as a PNG.

    Args:
        y_actual: Sequence of actual prices (plotted against sample position).
        y_pred: Sequence of predicted prices, same length as ``y_actual``.
        data_type: Which split the values come from (e.g. 'train' or 'test').
        param_label: Label of the hyperparameter set.
        train_name: Name of the dataset the model was trained on.
        test_name: Name of the dataset the model was evaluated on.

    Returns:
        None. The figure is written to the current working directory and closed.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(y_actual, label='Actual', color='blue')
    ax.plot(y_pred, label='Predicted', color='red')
    # The previous title read "<train_name> trained on <test_name>", which misstated
    # the roles of the two datasets.
    ax.set_title(f'{param_label} - {data_type} Set - trained on {train_name}, tested on {test_name}')
    ax.set_xlabel('Samples')
    ax.set_ylabel('Closing Price')
    ax.legend()
    # File name pattern left unchanged so existing references to the images still match.
    filename = f'actual_vs_predicted_{data_type}_{param_label}_{train_name}_trained_on_{test_name}.png'
    fig.savefig(filename)
    plt.close(fig)

# Loop through each row in results to plot both training and testing data predictions.
# NOTE(review): X_train, y_train, X_test, y_test, target_scaler_train and
# target_scaler_test are not defined in this cell; they are whatever an earlier
# cell left in the kernel, and the same arrays are used for every row regardless
# of its Train_Dataset / Test_Dataset — confirm this is intended.
skipped_models = []
for idx, row in results_df.iterrows():
    train_name = row['Train_Dataset']
    test_name = row['Test_Dataset']
    param_label = row['Parameter_Set']

    # Locate the corresponding model file
    model_filename = f'lstm_model_{train_name}_trained_on_{test_name}_params_{param_label}.pkl'

    # A missing pickle would otherwise abort the whole cell part-way through;
    # report it and move on, the same way the GRU plotting cell does.
    if not os.path.exists(model_filename):
        print(f"Model file not found: {model_filename}. Skipping this combination.")
        skipped_models.append(model_filename)
        continue

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load pickles that come from a trusted source.
    with open(model_filename, 'rb') as file:
        lstm_model = pickle.load(file)

    # Use the trained scalers and sequences to plot predictions
    for data_type, X, y, target_scaler in [('train', X_train, y_train, target_scaler_train),
                                           ('test', X_test, y_test, target_scaler_test)]:
        # Undo the target scaling on predictions and on the actual values.
        # reshape(-1, 1) alone yields the single-column 2-D array that
        # inverse_transform is given here; a preceding flatten() is redundant.
        y_pred = target_scaler.inverse_transform(lstm_model.predict(X).reshape(-1, 1)).flatten()
        y_actual = target_scaler.inverse_transform(y.reshape(-1, 1)).flatten()

        # Generate and save the plot
        plot_actual_vs_predicted(y_actual, y_pred, data_type, param_label, train_name, test_name)

# Only claim that everything was plotted when no row was skipped.
if skipped_models:
    print(f"Plot generation complete; skipped {len(skipped_models)} combination(s) with a missing model file.")
else:
    print("All plots have been generated and saved.")


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 810ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1s/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 733ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 802ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1s/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 775ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms

**Plots for GRU**

In [131]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os

# Read the GRU results table; the plotting loop below takes the
# 'Train_Dataset', 'Test_Dataset' and 'Parameter_Set' columns from each row.
gru_results_df = pd.read_csv(
    filepath_or_buffer='GRU_Train_Test_Results_98_combinations.csv',
)

# Define a function to create and save plots for actual vs. predicted values for GRU
# NOTE: this notebook defines `plot_actual_vs_predicted` in more than one cell
# (the variants differ only in the output filename prefix); whichever
# definition was executed most recently is the one in effect.
def plot_actual_vs_predicted(y_actual, y_pred, data_type, param_label, train_name, test_name):
    """Save a line plot of actual vs. predicted closing prices as a PNG.

    Parameters
    ----------
    y_actual, y_pred : array-like
        Values drawn as the blue 'Actual' and red 'Predicted' lines. Each is
        passed to ``plot`` on its own, so it is drawn against its sample index.
    data_type, param_label, train_name, test_name
        Interpolated into the plot title and the output filename.

    Returns
    -------
    None
        The figure is written to
        ``GRU_actual_vs_predicted_{data_type}_{param_label}_{train_name}_trained_on_{test_name}.png``
        (path relative to the current working directory) and then closed.
    """
    # Explicit figure/axes handles: nothing here depends on pyplot's implicit
    # "current figure", and the exact figure created here is the one closed.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(y_actual, label='Actual', color='blue')
        ax.plot(y_pred, label='Predicted', color='red')
        ax.set_title(f'{param_label} - {data_type} Set - {train_name} trained on {test_name}')
        ax.set_xlabel('Samples')
        ax.set_ylabel('Closing Price')
        ax.legend()
        filename = f'GRU_actual_vs_predicted_{data_type}_{param_label}_{train_name}_trained_on_{test_name}.png'
        fig.savefig(filename)
    finally:
        # Close even when plotting or saving raises; this function is called in
        # a loop and unclosed figures would otherwise pile up in memory.
        plt.close(fig)

# For every row of the GRU results table, load the matching pickled model (when
# its file exists) and save one train plot and one test plot.
for idx, row in gru_results_df.iterrows():
    train_name, test_name, param_label = (
        row['Train_Dataset'],
        row['Test_Dataset'],
        row['Parameter_Set'],
    )
    model_filename = f'gru_model_{train_name}_trained_on_{test_name}_params_{param_label}.pkl'

    if os.path.exists(model_filename):
        # File is present: unpickle the GRU model
        with open(model_filename, 'rb') as file:
            gru_model = pickle.load(file)
    else:
        # Nothing to plot for this combination; report it and go to the next row
        print(f"Model file not found: {model_filename}. Skipping this combination.")
        continue

    # NOTE(review): X_train, y_train, X_test, y_test and both target scalers are
    # not defined in this cell; they must already exist in the kernel, and the
    # same arrays are used for every row — confirm this is intended.
    for data_type, X, y, target_scaler in (
        ('train', X_train, y_train, target_scaler_train),
        ('test', X_test, y_test, target_scaler_test),
    ):
        # Predict, then undo the target scaling for predictions and actuals
        scaled_pred = gru_model.predict(X).flatten().reshape(-1, 1)
        y_pred = target_scaler.inverse_transform(scaled_pred).flatten()
        y_actual = target_scaler.inverse_transform(y.reshape(-1, 1)).flatten()

        # Write the PNG for this split
        plot_actual_vs_predicted(y_actual, y_pred, data_type, param_label, train_name, test_name)

print("Plot generation complete.")


Model file not found: gru_model_All_Company_trained_on_AAPL_Test_params_Optuna.pkl. Skipping this combination.
Model file not found: gru_model_All_Company_trained_on_AAPL_Test_params_RGS.pkl. Skipping this combination.
Model file not found: gru_model_All_Company_trained_on_FOX_Test_params_Optuna.pkl. Skipping this combination.
Model file not found: gru_model_All_Company_trained_on_FOX_Test_params_RGS.pkl. Skipping this combination.
Model file not found: gru_model_All_Company_trained_on_IBM_Test_params_Optuna.pkl. Skipping this combination.
Model file not found: gru_model_All_Company_trained_on_IBM_Test_params_RGS.pkl. Skipping this combination.
Model file not found: gru_model_All_Company_trained_on_NOW_Test_params_Optuna.pkl. Skipping this combination.
Model file not found: gru_model_All_Company_trained_on_NOW_Test_params_RGS.pkl. Skipping this combination.
Model file not found: gru_model_All_Company_trained_on_REG_Test_params_Optuna.pkl. Skipping this combination.
[1m3/3[0m [32m━━━

**Plots for RNN**


In [111]:
# Read the RNN results table; the plotting loop below takes the
# 'Train_Dataset', 'Test_Dataset' and 'Parameter_Set' columns from each row.
rnn_results_df = pd.read_csv(
    filepath_or_buffer='RNN_Train_Test_Results_98_combinations.csv',
)

# Define a function to create and save plots for actual vs. predicted values for RNN
# NOTE: this notebook defines `plot_actual_vs_predicted` in more than one cell
# (the variants differ only in the output filename prefix); whichever
# definition was executed most recently is the one in effect.
def plot_actual_vs_predicted(y_actual, y_pred, data_type, param_label, train_name, test_name):
    """Save a line plot of actual vs. predicted closing prices as a PNG.

    Parameters
    ----------
    y_actual, y_pred : array-like
        Values drawn as the blue 'Actual' and red 'Predicted' lines. Each is
        passed to ``plot`` on its own, so it is drawn against its sample index.
    data_type, param_label, train_name, test_name
        Interpolated into the plot title and the output filename.

    Returns
    -------
    None
        The figure is written to
        ``RNN_actual_vs_predicted_{data_type}_{param_label}_{train_name}_trained_on_{test_name}.png``
        (path relative to the current working directory) and then closed.
    """
    # Explicit figure/axes handles: nothing here depends on pyplot's implicit
    # "current figure", and the exact figure created here is the one closed.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(y_actual, label='Actual', color='blue')
        ax.plot(y_pred, label='Predicted', color='red')
        ax.set_title(f'{param_label} - {data_type} Set - {train_name} trained on {test_name}')
        ax.set_xlabel('Samples')
        ax.set_ylabel('Closing Price')
        ax.legend()
        filename = f'RNN_actual_vs_predicted_{data_type}_{param_label}_{train_name}_trained_on_{test_name}.png'
        fig.savefig(filename)
    finally:
        # Close even when plotting or saving raises; this function is called in
        # a loop and unclosed figures would otherwise pile up in memory.
        plt.close(fig)

# Loop through each row in results to plot both training and testing data predictions.
# NOTE(review): X_train, y_train, X_test, y_test, target_scaler_train and
# target_scaler_test are not defined in this cell; they are whatever an earlier
# cell left in the kernel, and the same arrays are used for every row regardless
# of its Train_Dataset / Test_Dataset — confirm this is intended.
skipped_models = []
for idx, row in rnn_results_df.iterrows():
    train_name = row['Train_Dataset']
    test_name = row['Test_Dataset']
    param_label = row['Parameter_Set']

    # Locate the corresponding RNN model file
    model_filename = f'rnn_model_{train_name}_trained_on_{test_name}_params_{param_label}.pkl'

    # A missing pickle would otherwise abort the whole cell part-way through;
    # report it and move on, the same way the GRU plotting cell does.
    if not os.path.exists(model_filename):
        print(f"Model file not found: {model_filename}. Skipping this combination.")
        skipped_models.append(model_filename)
        continue

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load pickles that come from a trusted source.
    with open(model_filename, 'rb') as file:
        rnn_model = pickle.load(file)

    # Use the trained scalers and sequences to plot predictions
    for data_type, X, y, target_scaler in [('train', X_train, y_train, target_scaler_train),
                                           ('test', X_test, y_test, target_scaler_test)]:
        # Undo the target scaling on predictions and on the actual values.
        # reshape(-1, 1) alone yields the single-column 2-D array that
        # inverse_transform is given here; a preceding flatten() is redundant.
        y_pred = target_scaler.inverse_transform(rnn_model.predict(X).reshape(-1, 1)).flatten()
        y_actual = target_scaler.inverse_transform(y.reshape(-1, 1)).flatten()

        # Generate and save the plot
        plot_actual_vs_predicted(y_actual, y_pred, data_type, param_label, train_name, test_name)

# Only claim that everything was plotted when no row was skipped.
if skipped_models:
    print(f"RNN plot generation complete; skipped {len(skipped_models)} combination(s) with a missing model file.")
else:
    print("All RNN plots have been generated and saved.")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 361ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 243ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 357ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 233ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 418ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 283ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 356ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s