In [None]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.11.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.11.0


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from keras.layers import Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
import tensorflow as tf
import random

#Seed the Python, NumPy and TensorFlow random generators with the same value.
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(42)

In [None]:
from google.colab import drive
#Mount Google Drive at /content/drive; the dataset is read from this mount point below.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Load the merged dataset from the mounted Drive folder.
merged_n = pd.read_csv(r'/content/drive/MyDrive/Dissertation_10862121/merged_n.csv')

In [None]:
#Output folder on Drive. The raw-string prefix belongs outside the quotes; inside them it
#becomes part of the path and yields a relative directory whose name starts with r'.
save_dir = r'/content/drive/MyDrive/Dissertation_10862121/'

In [None]:
import os

#exist_ok=True makes this cell safe to re-run and removes the gap between
#checking for the folder and creating it.
os.makedirs(save_dir, exist_ok=True)

In [None]:
#Parse both timestamp columns, then use the GMT timestamp as the row index.
merged_n = merged_n.assign(
    GMT=pd.to_datetime(merged_n['GMT']),
    Date=pd.to_datetime(merged_n['Date']),
).set_index('GMT')

Feature Selection

In [None]:
#Wind speed is left out of the feature set.
merged_n = merged_n.drop("Hourly Mean Windspeed (kn)", axis=1)

Feature Engineering

Lags

Consumption

In [None]:
#Consumption lags: the hour before, the same hour of the day before,
#and the same hour of the same weekday one week before.
consumption = merged_n['Total Household Consumption (N)']
for lag_name, lag_rows in (
    ('lag_1_hour_energy', 1),
    ('lag_1_day_energy', 24),
    ('lag_7_day_energy', 24 * 7),
):
    merged_n[lag_name] = consumption.shift(lag_rows)

Temperature

In [None]:
#Temperature lags. Each lag is written to its own column; reusing a single column name
#would leave only the last assignment (the 7-day lag) in the frame.
temperature = merged_n['Hourly Temperature (C)']

#Lag for the hour before
merged_n['lag_1_hour_temp'] = temperature.shift(1)

#Lag for the same hour of the day before
merged_n['lag_1_day_temp'] = temperature.shift(24)

#Lag for the same hour of the same day of the week before (7 days before)
merged_n['lag_7_day_temp'] = temperature.shift(24 * 7)

Radiation

In [None]:
#Radiation lags. Each lag is written to its own column; reusing a single column name
#would leave only the last assignment (the 7-day lag) in the frame.
radiation = merged_n['Hourly Global Radiation (KJ/m2)']

#Lag for the hour before
merged_n['lag_1_hour_rad'] = radiation.shift(1)

#Lag for the same hour of the day before
merged_n['lag_1_day_rad'] = radiation.shift(24)

#Lag for the same hour of the same day of the week before (7 days before)
merged_n['lag_7_day_rad'] = radiation.shift(24 * 7)

Rolling Statistics - Consumption

In [None]:
#Trailing mean of consumption over the last `window_size` rows (current row included).
window_size = 72
consumption_window = merged_n['Total Household Consumption (N)'].rolling(window=window_size)
merged_n['rolling_mean_consumption'] = consumption_window.mean()

Cyclical Features

In [None]:
#Map the hour onto a 24-step circle and store its sine and cosine.
hour_angle = 2 * np.pi * merged_n['Hour'] / 24
merged_n['hour_sin'] = np.sin(hour_angle)
merged_n['hour_cos'] = np.cos(hour_angle)

Weather Combination Variables

In [None]:
#Interaction term: temperature multiplied by relative humidity.
merged_n['temp_humidity_interaction'] = merged_n['Hourly Temperature (C)'].mul(
    merged_n['Hourly Relative Humidity (%)']
)

In [None]:
#Interaction term: temperature multiplied by global radiation.
merged_n['temp_rad_interaction'] = merged_n['Hourly Temperature (C)'].mul(
    merged_n['Hourly Global Radiation (KJ/m2)']
)

Exponentially Weighted Observations (more weight given to more recent values)

In [None]:
#Exponentially weighted mean of consumption with smoothing factor `alpha`.
alpha = 0.9
consumption_ewm = merged_n['Total Household Consumption (N)'].ewm(alpha=alpha)
merged_n['ewm_consumption'] = consumption_ewm.mean()

Difference between consecutive hours

In [None]:
#Change in consumption relative to the previous row.
merged_n['consumption_diff'] = merged_n['Total Household Consumption (N)'].diff(periods=1)

Dropping Missing Value Rows after Feature Engineering

In [None]:
#Discard every row that has a missing value in any column
#(the lag, rolling and diff features leave NaNs in the leading rows).
merged_n = merged_n.dropna()

In [None]:
#Store the holiday flag as an integer column.
merged_n = merged_n.astype({'IsHoliday': int})

In [None]:
#List the columns that are neither int64 nor float64.
numeric_dtypes = ['int64', 'float64']
non_num_columns = merged_n.select_dtypes(exclude=numeric_dtypes).columns
print(non_num_columns)

Index(['Date', 'Time'], dtype='object')


In [None]:
#Date and Time are not numerical, so they cannot be fed to the model; remove them.
merged_n = merged_n.drop(['Date', 'Time'], axis=1)

In [None]:
#Re-check: no non-numerical columns should be left after the drop.
numeric_dtypes = ['int64', 'float64']
non_num_columns = merged_n.select_dtypes(exclude=numeric_dtypes).columns
print(non_num_columns)

Index([], dtype='object')


Standard Scaler

In [None]:
#Selecting numerical columns
numerical_cols = merged_n.select_dtypes(include=['float64', 'int64']).columns

#Applying StandardScaler
#Fit on the training period only (rows up to and including 2013-10-31, the same date the
#train/test split is taken from) so test-period statistics do not leak into the scaling;
#the fitted scaler is then applied to the whole frame.
standard_scaler = StandardScaler()
standard_scaler.fit(merged_n.loc[:'2013-10-31', numerical_cols])
merged_n_standard = merged_n.copy()
merged_n_standard[numerical_cols] = standard_scaler.transform(merged_n[numerical_cols])

MinMaxScaler

In [None]:
#Fit on the training period only (rows up to and including 2013-10-31, the same date the
#train/test split is taken from) so the test-period min/max do not leak into the scaling.
#Values after the cut-off may therefore fall slightly outside [0, 1].
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(merged_n.loc[:'2013-10-31', numerical_cols])
merged_n_minmax = merged_n.copy()
merged_n_minmax[numerical_cols] = minmax_scaler.transform(merged_n[numerical_cols])

Data Preparation

Train/Test Split

In [None]:
#Integer position of the last row dated 2013-10-31: get_loc on the date string yields a
#slice over that day's rows, and `.stop - 1` is the position of its final row.
end_of_october = merged_n.index.get_loc('2013-10-31').stop - 1
end_of_october

7127

In [None]:
#Split the standardised frame into a feature matrix and the target vector.
target_column = 'Total Household Consumption (N)'
X = merged_n_standard.drop(columns=[target_column]).to_numpy()
y = merged_n_standard[target_column].to_numpy()

#Data generation function
def generate_dataset(X, y, time_steps=168, out_steps=1):
    """Cut aligned feature/target arrays into supervised sliding windows.

    Sample ``i`` pairs the feature rows ``X[i : i + time_steps]`` with the
    targets ``y[i + time_steps : i + time_steps + out_steps]``.

    Parameters
    ----------
    X : array-like
        Feature rows, one per time step.
    y : array-like
        Target values aligned row-for-row with ``X``.
    time_steps : int
        Number of consecutive rows in each input window.
    out_steps : int
        Number of target values that follow each window.

    Returns
    -------
    tuple of np.ndarray
        ``(Xs, ys)`` with one entry per window along the first axis. Both are
        empty when the input is shorter than ``time_steps + out_steps``.
    """
    #The last usable start is len(X) - time_steps - out_steps (its targets end exactly at
    #the final row), so the number of windows is that value plus one.
    n_samples = max(len(X) - time_steps - out_steps + 1, 0)
    Xs = [X[i:i + time_steps] for i in range(n_samples)]
    ys = [y[i + time_steps:i + time_steps + out_steps] for i in range(n_samples)]
    return np.array(Xs), np.array(ys)

#Rows per input window and number of target values predicted per sample.
TIME_STEPS = 24
OUT_STEPS = 1

X_data, y_data = generate_dataset(X, y, time_steps=TIME_STEPS, out_steps=OUT_STEPS)

#Sample i has its target at row i + TIME_STEPS, so the first `split_index` samples
#are those whose target lies at or before row `end_of_october`.
split_index = end_of_october - TIME_STEPS + 1

X_train, X_test = X_data[:split_index], X_data[split_index:]
y_train, y_test = y_data[:split_index], y_data[split_index:]

LSTM Build-up

In [None]:
from sklearn.metrics import make_scorer

def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, in percent, ignoring zero targets.

    Parameters
    ----------
    y_true : array-like
        Observed values. Entries equal to 0 are skipped because the relative
        error is undefined there.
    y_pred : array-like
        Predicted values with the same number of elements as ``y_true``.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / |y_true|)`` over the non-zero targets,
        or NaN when every target is zero.

    Raises
    ------
    ValueError
        If the two inputs hold a different number of elements.
    """
    #Flatten both inputs so a column-vector target, shape (n, 1), lines up with 1-D
    #predictions, shape (n,), instead of breaking the boolean mask below.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    non_zero_elements = y_true != 0
    if not non_zero_elements.any():
        #Nothing to average over; return NaN explicitly rather than via an empty mean.
        return np.nan
    mape = np.mean(np.abs((y_true[non_zero_elements] - y_pred[non_zero_elements]) / y_true[non_zero_elements])) * 100
    return mape

#Wrap the metric as a scorer for model selection. MAPE is an error, so it is registered with
#greater_is_better=False; the search below reports the resulting score as negative MAPE.
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [None]:
def create_model(optimizer='adam', lstm_units=50, dropout_rate=0.0):
    """Build and compile a two-layer stacked LSTM regressor.

    The input shape is ``(TIME_STEPS, X_data.shape[2])`` and the output layer
    has ``OUT_STEPS`` units; all three are read from notebook-level globals.

    Parameters
    ----------
    optimizer : str or optimizer instance
        Passed straight to ``compile``.
    lstm_units : int
        Units in each of the two LSTM layers.
    dropout_rate : float
        Input dropout applied inside both LSTM layers.

    Returns
    -------
    The compiled Sequential model, trained with MAE loss.
    """
    n_features = X_data.shape[2]
    #The first LSTM returns the full sequence so the second LSTM can consume it.
    sequence_lstm = LSTM(
        lstm_units,
        activation='tanh',
        recurrent_activation='sigmoid',
        input_shape=(TIME_STEPS, n_features),
        return_sequences=True,
        dropout=dropout_rate,
    )
    summary_lstm = LSTM(
        lstm_units,
        activation='tanh',
        recurrent_activation='sigmoid',
        dropout=dropout_rate,
    )
    network = Sequential([sequence_lstm, summary_lstm, Dense(OUT_STEPS)])
    network.compile(optimizer=optimizer, loss='mae')
    return network

In [None]:
#KerasRegressor with parameters
#SciKeras takes the model-building callable through `model`; `build_fn` is the deprecated
#alias. The remaining keyword arguments mirror create_model's parameters and serve as the
#defaults that the hyperparameter search overrides.
regressor = KerasRegressor(model=create_model, optimizer='adam', lstm_units=50, dropout_rate=0.0, verbose=0)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

param_distributions = {
    'lstm_units': [70, 90, 110],
    'dropout_rate': [0.0, 0.1],
    'optimizer': ['adam'],
    'batch_size': [8, 16],
    'epochs': [20, 30, 40]
}

#For a single-output target, hand the estimator a 1-D vector instead of an (n, 1) column.
#NOTE(review): the column vector is presumably what triggers the warning repeated on every fit.
y_train_search = y_train[:, 0] if (y_train.ndim == 2 and y_train.shape[1] == 1) else y_train

#Random search setup
#The windows are ordered in time, so folds are built with TimeSeriesSplit: each validation
#fold lies strictly after the data it is trained on, which plain K-fold does not guarantee.
#random_state fixes which parameter combinations are sampled.
#NOTE(review): y is the standardised target, so values close to zero inflate MAPE;
#consider scoring on the original scale.
random_search = RandomizedSearchCV(estimator=regressor,
                                   param_distributions=param_distributions,
                                   n_iter=20,
                                   scoring=mape_scorer,
                                   cv=TimeSeriesSplit(n_splits=5),
                                   random_state=42,
                                   verbose=1)

random_search_result = random_search.fit(X_train, y_train_search)

#Results
print(f"Best score (negative MAPE): {random_search_result.best_score_}")
print(f"Best hyperparameters: {random_search_result.best_params_}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y =

Best score (negative MAPE): -81.40062332798607
Best hyperparameters: {'optimizer': 'adam', 'lstm_units': 90, 'epochs': 30, 'dropout_rate': 0.0, 'batch_size': 8}


In [None]:
#Collect the per-candidate search results into a DataFrame and display it.
search_results = random_search_result.cv_results_
results_df = pd.DataFrame.from_dict(search_results)

results_df