## Random forest


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the dataset
file_path = '.../allinone_adjusted_final.xlsx'
data = pd.read_excel(file_path)

# Select four available hubs for the prediction task
hubs = ['MidC', 'Palo Verde Peak', 'Indiana', 'SP15 EZ Gen DA LMP Peak']
data_filtered = data[data['Price hub'].isin(hubs)]

# Group by 'Trade date' and 'Price hub' and take the mean of 'price'
data_grouped = data_filtered.groupby(['Trade date', 'Price hub']).agg({'price': 'mean'}).reset_index()

# Pivot the data to have each hub's features as columns
data_pivoted = data_grouped.pivot(index='Trade date', columns='Price hub', values='price')

# Drop rows with missing values (if any)
data_pivoted.dropna(inplace=True)

# Add time-based features
data_pivoted['day_of_week'] = data_pivoted.index.dayofweek
data_pivoted['day_of_month'] = data_pivoted.index.day
data_pivoted['month'] = data_pivoted.index.month
data_pivoted['quarter'] = data_pivoted.index.quarter

# Add lagged features and rolling statistics
for hub in hubs:
    data_pivoted[f'{hub}_lag1'] = data_pivoted[hub].shift(1)
    data_pivoted[f'{hub}_ma7'] = data_pivoted[hub].rolling(window=7).mean()

# Drop rows with NaN values after adding lagged features
data_pivoted.dropna(inplace=True)

# Define features and target
X = data_pivoted.drop(columns=['SP15 EZ Gen DA LMP Peak'])
y = data_pivoted['SP15 EZ Gen DA LMP Peak']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data (optional for Random Forest, but we will do it for consistency)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the Random Forest model
rf = RandomForestRegressor(random_state=42)

# Define the grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Use GridSearchCV to search for the best hyperparameters
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Get the best model from grid search
best_rf = grid_search.best_estimator_

# Evaluate the best model
predictions = best_rf.predict(X_test_scaled)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')


Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters: {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Mean Squared Error: 25.841363893944475
Root Mean Squared Error: 5.083440163309142


## Automated Hyperparameter Tuning for Deep Neural Networks

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the dataset
file_path = '.../allinone_adjusted_final.xlsx'
data = pd.read_excel(file_path)

# Select four available hubs for the prediction task
hubs = ['MidC', 'Palo Verde Peak', 'Indiana', 'SP15 EZ Gen DA LMP Peak']
data_filtered = data[data['Price hub'].isin(hubs)]

# Group by 'Trade date' and 'Price hub' and take the mean of 'price'
data_grouped = data_filtered.groupby(['Trade date', 'Price hub']).agg({'price': 'mean'}).reset_index()

# Pivot the data to have each hub's features as columns
data_pivoted = data_grouped.pivot(index='Trade date', columns='Price hub', values='price')

# Drop rows with missing values (if any)
data_pivoted.dropna(inplace=True)

# Add time-based features
data_pivoted['day_of_week'] = data_pivoted.index.dayofweek
data_pivoted['day_of_month'] = data_pivoted.index.day
data_pivoted['month'] = data_pivoted.index.month
data_pivoted['quarter'] = data_pivoted.index.quarter

# Add lagged features and rolling statistics
for hub in hubs:
    data_pivoted[f'{hub}_lag1'] = data_pivoted[hub].shift(1)
    data_pivoted[f'{hub}_ma7'] = data_pivoted[hub].rolling(window=7).mean()

# Drop rows with NaN values after adding lagged features
data_pivoted.dropna(inplace=True)

# Define features and target
X = data_pivoted.drop(columns=['SP15 EZ Gen DA LMP Peak'])
y = data_pivoted['SP15 EZ Gen DA LMP Peak']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the model
def build_model(optimizer='adam', dropout_rate=0.2, neurons=128):
    model = Sequential([
        Dense(neurons, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dropout(dropout_rate),
        Dense(neurons, activation='relu'),
        Dropout(dropout_rate),
        Dense(1)  # Output layer for regression
    ])
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

# Define hyperparameters to tune
batch_sizes = [32, 64, 128]
epochs_list = [50, 100, 150]
optimizers = ['adam', 'rmsprop']
dropout_rates = [0.2, 0.3, 0.4]
neuron_sizes = [64, 128, 256]

best_model = None
best_mse = float('inf')
best_params = {}

# Manual hyperparameter tuning
for batch_size in batch_sizes:
    for epochs in epochs_list:
        for optimizer in optimizers:
            for dropout_rate in dropout_rates:
                for neurons in neuron_sizes:
                    model = build_model(optimizer=optimizer, dropout_rate=dropout_rate, neurons=neurons)
                    model.fit(X_train_scaled, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)
                    predictions = model.predict(X_test_scaled)
                    mse = mean_squared_error(y_test, predictions)
                    if mse < best_mse:
                        best_mse = mse
                        best_model = model
                        best_params = {
                            'batch_size': batch_size,
                            'epochs': epochs,
                            'optimizer': optimizer,
                            'dropout_rate': dropout_rate,
                            'neurons': neurons
                        }

# Display the best parameters
print(f"Best parameters: {best_params}")

# Evaluate the best model
loss, mae = best_model.evaluate(X_test_scaled, y_test)
print(f'Mean Absolute Error: {mae}')

# Predict
predictions = best_model.predict(X_test_scaled)

# Calculate MSE and RMSE
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')


Best parameters: {'batch_size': 32, 'epochs': 150, 'optimizer': 'adam', 'dropout_rate': 0.2, 'neurons': 256}
Mean Absolute Error: 3.223870038986206
Mean Squared Error: 22.602132612971293
Root Mean Squared Error: 4.754170023565764


## Automated Hyperparameter Tuning for LSTM Neural Networks


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the dataset
file_path = '.../allinone_adjusted_final.xlsx'
data = pd.read_excel(file_path)

# Select four available hubs for the prediction task
hubs = ['MidC', 'Palo Verde Peak', 'Indiana', 'SP15 EZ Gen DA LMP Peak']
data_filtered = data[data['Price hub'].isin(hubs)]

# Group by 'Trade date' and 'Price hub' and take the mean of 'price'
data_grouped = data_filtered.groupby(['Trade date', 'Price hub']).agg({'price': 'mean'}).reset_index()

# Pivot the data to have each hub's features as columns
data_pivoted = data_grouped.pivot(index='Trade date', columns='Price hub', values='price')

# Drop rows with missing values (if any)
data_pivoted.dropna(inplace=True)

# Add time-based features
data_pivoted['day_of_week'] = data_pivoted.index.dayofweek
data_pivoted['day_of_month'] = data_pivoted.index.day
data_pivoted['month'] = data_pivoted.index.month
data_pivoted['quarter'] = data_pivoted.index.quarter

# Add lagged features and rolling statistics
for hub in hubs:
    data_pivoted[f'{hub}_lag1'] = data_pivoted[hub].shift(1)
    data_pivoted[f'{hub}_ma7'] = data_pivoted[hub].rolling(window=7).mean()

# Drop rows with NaN values after adding lagged features
data_pivoted.dropna(inplace=True)

# Define features and target
X = data_pivoted.drop(columns=['SP15 EZ Gen DA LMP Peak'])
y = data_pivoted['SP15 EZ Gen DA LMP Peak']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for LSTM [samples, timesteps, features]
X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Define the LSTM model
def build_lstm_model(optimizer='adam', dropout_rate=0.2, neurons=50):
    model = Sequential([
        LSTM(neurons, activation='relu', input_shape=(X_train_scaled.shape[1], X_train_scaled.shape[2])),
        Dropout(dropout_rate),
        Dense(1)  # Output layer for regression
    ])
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

# Define hyperparameters to tune
batch_sizes = [32, 64, 128]
epochs_list = [50, 100, 150]
optimizers = ['adam', 'rmsprop']
dropout_rates = [0.2, 0.3, 0.4]
neuron_sizes = [50, 100, 150]

best_model = None
best_mse = float('inf')
best_params = {}

# Manual hyperparameter tuning
for batch_size in batch_sizes:
    for epochs in epochs_list:
        for optimizer in optimizers:
            for dropout_rate in dropout_rates:
                for neurons in neuron_sizes:
                    model = build_lstm_model(optimizer=optimizer, dropout_rate=dropout_rate, neurons=neurons)
                    model.fit(X_train_scaled, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)
                    predictions = model.predict(X_test_scaled)
                    mse = mean_squared_error(y_test, predictions)
                    if mse < best_mse:
                        best_mse = mse
                        best_model = model
                        best_params = {
                            'batch_size': batch_size,
                            'epochs': epochs,
                            'optimizer': optimizer,
                            'dropout_rate': dropout_rate,
                            'neurons': neurons
                        }

# Display the best parameters
print(f"Best parameters: {best_params}")

# Evaluate the best model
loss, mae = best_model.evaluate(X_test_scaled, y_test)
print(f'Mean Absolute Error: {mae}')

# Predict
predictions = best_model.predict(X_test_scaled)

# Calculate MSE and RMSE
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')


Best parameters: {'batch_size': 32, 'epochs': 150, 'optimizer': 'rmsprop', 'dropout_rate': 0.2, 'neurons': 100}
Mean Absolute Error: 4.129806041717529
Mean Squared Error: 42.88210866803312
Root Mean Squared Error: 6.5484432247697715
