In [1]:
import pandas as pd
import numpy as np
import torch as t
import plotly.graph_objects as go
import seaborn as sns

In [131]:
# load in clean dataset
clean_data = pd.read_csv(r"/home/tobi/Desktop/Capstone/EnergyDemandForecasting/src/Saved/Datasets/clean_training.csv", index_col=0)
clean_data.index = pd.to_datetime(clean_data.index)
clean_data.loc[:,"HourlyPrecipitation"] = clean_data["HourlyPrecipitation"].replace({np.nan:"None"})
display(clean_data)

Unnamed: 0,Energy Demand (MWH),HourlyDryBulbTemperature,HourlyDewPointTemperature,HourlyStationPressure,HourlyPrecipitation,HourlyWindSpeed,Energy Price (cents/KWH),Labor Force Level,Civilian Noninstitutional Population,Labor Force Participation
2018-06-19 05:00:00,7221.0,78.0,72.0,29.62,,6.000000,19.28,4272100.0,7105823.0,60.1
2018-06-19 06:00:00,6911.0,79.0,70.0,29.63,,6.000000,19.28,4272100.0,7105823.0,60.1
2018-06-19 07:00:00,6691.0,81.0,69.0,29.63,,3.544254,19.28,4272100.0,7105823.0,60.1
2018-06-19 08:00:00,6582.0,82.0,67.0,29.63,,3.789006,19.28,4272100.0,7105823.0,60.1
2018-06-19 09:00:00,6600.0,81.0,69.0,29.65,,7.000000,19.28,4272100.0,7105823.0,60.1
...,...,...,...,...,...,...,...,...,...,...
2023-08-05 14:00:00,6168.0,84.0,58.0,29.81,,6.000000,22.33,4165918.0,6747734.0,61.7
2023-08-05 15:00:00,6431.0,83.0,60.0,29.82,,7.000000,22.33,4165918.0,6747734.0,61.7
2023-08-05 16:00:00,6599.0,82.0,55.0,29.82,,0.000000,22.33,4165918.0,6747734.0,61.7
2023-08-05 17:00:00,6745.0,81.0,56.0,29.82,,5.000000,22.33,4165918.0,6747734.0,61.7


In [132]:
input_data = clean_data.copy()
output_data = clean_data.copy()

In [133]:
# encode hour of the day, day of the week, and day of the year into a new dataframe
input_time = pd.DataFrame(data={"Hour of Day":input_data.index.hour, "Day of Week":input_data.index.dayofweek, "Day of Year":input_data.index.dayofyear}, index=input_data.index)
# input_data.loc[:,"Hour of Day"] = input_data.index.hour
# input_data.loc[:,"Day of Week"] = input_data.index.dayofweek
# input_data.loc[:,"Day of Year"] = input_data.index.dayofyear
display(input_data)
display(input_time)

# should also try doing this without sine and cosine and compare the difference

Unnamed: 0,Energy Demand (MWH),HourlyDryBulbTemperature,HourlyDewPointTemperature,HourlyStationPressure,HourlyPrecipitation,HourlyWindSpeed,Energy Price (cents/KWH),Labor Force Level,Civilian Noninstitutional Population,Labor Force Participation
2018-06-19 05:00:00,7221.0,78.0,72.0,29.62,,6.000000,19.28,4272100.0,7105823.0,60.1
2018-06-19 06:00:00,6911.0,79.0,70.0,29.63,,6.000000,19.28,4272100.0,7105823.0,60.1
2018-06-19 07:00:00,6691.0,81.0,69.0,29.63,,3.544254,19.28,4272100.0,7105823.0,60.1
2018-06-19 08:00:00,6582.0,82.0,67.0,29.63,,3.789006,19.28,4272100.0,7105823.0,60.1
2018-06-19 09:00:00,6600.0,81.0,69.0,29.65,,7.000000,19.28,4272100.0,7105823.0,60.1
...,...,...,...,...,...,...,...,...,...,...
2023-08-05 14:00:00,6168.0,84.0,58.0,29.81,,6.000000,22.33,4165918.0,6747734.0,61.7
2023-08-05 15:00:00,6431.0,83.0,60.0,29.82,,7.000000,22.33,4165918.0,6747734.0,61.7
2023-08-05 16:00:00,6599.0,82.0,55.0,29.82,,0.000000,22.33,4165918.0,6747734.0,61.7
2023-08-05 17:00:00,6745.0,81.0,56.0,29.82,,5.000000,22.33,4165918.0,6747734.0,61.7


Unnamed: 0,Hour of Day,Day of Week,Day of Year
2018-06-19 05:00:00,5,1,170
2018-06-19 06:00:00,6,1,170
2018-06-19 07:00:00,7,1,170
2018-06-19 08:00:00,8,1,170
2018-06-19 09:00:00,9,1,170
...,...,...,...
2023-08-05 14:00:00,14,5,217
2023-08-05 15:00:00,15,5,217
2023-08-05 16:00:00,16,5,217
2023-08-05 17:00:00,17,5,217


In [16]:
if t.cuda.is_available():
    device = t.device("cuda")
    print("PyTorch is using:", t.cuda.get_device_name(0))
else:
    print("PyTorch is using CPU")

PyTorch is using: Radeon RX 7900 XTX


Encode Categorical Variables

In [134]:
input_data = pd.get_dummies(input_data, drop_first=True).astype("float32")
output_data = pd.get_dummies(output_data, drop_first=True).astype("float32")

Normalize features into [0,1]

In [135]:
input_min_vals = np.min(input_data, axis=0)
input_max_vals = np.max(input_data, axis=0)
input_time_min_vals = np.min(input_time, axis=0)
input_time_max_vals = np.max(input_time, axis=0)
output_min_vals = np.min(output_data, axis=0)
output_max_vals = np.max(output_data, axis=0)

# Normalize each column to be in the range [0, 1]
input_data = (input_data - input_min_vals) / (input_max_vals - input_min_vals)
input_time = (input_time - input_time_min_vals) / (input_time_max_vals - input_time_min_vals)
output_data = (output_data - output_min_vals) / (output_max_vals - output_min_vals)

In [196]:
def format_data_lstm(input_data:pd.DataFrame, output_data:pd.DataFrame, sequence_length:int, batch_size:int):
    pass


# formatted dataset: (N x B x S x K)
B = 8   # Batch size
S = 168   # Sequence length
K = input_data.shape[1]  # Number of features

# Reshape data to have dimensions (N x B x S x K)
x = input_data.values
y = output_data.values
# reshaped_data = np.tile(data, (N,B,S,K)) #.reshape(N, B, S, K)

# Calculate the number of sequences of length S that can be produced
num_sequences = x.shape[0] - S

# Initialize an empty list to store the groups
x_inputs = []
y_outputs = []

# Iterate over the array to create groups
for i in range(num_sequences):
    input = x[i:i+S]
    output = y[i+S]
    x_inputs.append(input)
    y_outputs.append(output)

# Convert the list of groups to a NumPy array
x = np.array(x_inputs)
y = np.array(y_outputs)
x = x[(x.shape[0]%B):]
y = y[(y.shape[0]%B):]
x = np.array(np.split(x, x.shape[0]//B, axis=0))
y = np.array(np.split(y, y.shape[0]//B, axis=0))

# display(data.shape)
display(x.shape)
display(y.shape)
# print(formatted_data)

(5599, 8, 168, 15)

(5599, 8, 12)

Format using DataLoader

In [136]:
# formatted dataset: (N x B x S x K)
B = 8   # Batch size
S = 168   # Sequence length
K = input_data.shape[1]  # Number of features

# Calculate the number of sequences of length S that can be produced
num_sequences = x.shape[0] - S

# Reshape data to have dimensions (N x B x S x K)
x = input_data.values
x_time = input_time.values
y = output_data.values

# Initialize an empty list to store the groups
x_inputs = []
x_time_inputs = []
y_outputs = []

# Iterate over the array to create groups
for i in range(num_sequences):
    input = x[i:i+S]
    time_input = x_time[i:i+S]
    output = y[i+S]
    x_inputs.append(input)
    y_outputs.append(output)
    x_time_inputs.append(time_input)

x_inputs = t.Tensor(np.array(x_inputs))
x_time_inputs = t.Tensor(np.array(x_time_inputs))
y_outputs = t.Tensor(np.array(y_outputs))

validation_size = int(np.floor(x_inputs.shape[0]*0.1))
# define train_loader from 90% of training data
train_dataset = t.utils.data.TensorDataset(x_inputs[:-validation_size], x_time_inputs[:-validation_size], y_outputs[:-validation_size])
train_loader = t.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=False)

# define validation_loader from 10% of training data
validation_dataset = t.utils.data.TensorDataset(x_time_inputs[-validation_size:], y_outputs[-validation_size:])
validation_loader = t.utils.data.DataLoader(validation_dataset, batch_size=1, shuffle=False)
initial_validation_time = input_time.index[-validation_size]

In [137]:
for x_input, time_input, output in train_loader:
    print(x_input.shape)
    print(time_input.shape)
    print(output.shape)
    break

# Test formatting
# y_index = np.random.randint(y.shape[0]*y.shape[1]-1)
y_index = np.random.randint(y.shape[0]-1)
y_sample_index = y_index//B
y_batch_index = y_index%B

x_index = y_index + 1
x_sample_index = x_index//B
x_batch_index = x_index%B

display(train_loader.dataset[x_index][0][-1].numpy() * (input_max_vals-input_min_vals) + input_min_vals)
display(train_loader.dataset[y_index][2].numpy() * (output_max_vals-output_min_vals) + output_min_vals)

torch.Size([8, 168, 12])
torch.Size([8, 168, 3])
torch.Size([8, 12])


Energy Demand (MWH)                     8.325000e+03
HourlyDryBulbTemperature                7.200000e+01
HourlyDewPointTemperature               7.000000e+01
HourlyStationPressure                   2.958000e+01
HourlyWindSpeed                         0.000000e+00
Energy Price (cents/KWH)                1.843000e+01
Labor Force Level                       4.077301e+06
Civilian Noninstitutional Population    7.061655e+06
Labor Force Participation               5.770000e+01
HourlyPrecipitation_Light Rain          0.000000e+00
HourlyPrecipitation_Medium Rain         0.000000e+00
HourlyPrecipitation_None                0.000000e+00
dtype: float32

Energy Demand (MWH)                     8.325000e+03
HourlyDryBulbTemperature                7.200000e+01
HourlyDewPointTemperature               7.000000e+01
HourlyStationPressure                   2.958000e+01
HourlyWindSpeed                         0.000000e+00
Energy Price (cents/KWH)                1.843000e+01
Labor Force Level                       4.077301e+06
Civilian Noninstitutional Population    7.061655e+06
Labor Force Participation               5.770000e+01
HourlyPrecipitation_Light Rain          0.000000e+00
HourlyPrecipitation_Medium Rain         0.000000e+00
HourlyPrecipitation_None                0.000000e+00
dtype: float32

In [197]:
display(clean_data.iloc[S+S:])

Unnamed: 0,Energy Demand (MWH),HourlyDryBulbTemperature,HourlyDewPointTemperature,HourlyStationPressure,HourlyPrecipitation,HourlyWindSpeed,Energy Price (cents/KWH),Labor Force Level,Civilian Noninstitutional Population,Labor Force Participation
2018-07-03 05:00:00,8149.0,80.0,74.0,30.01,,0.0,19.37,4271649.0,7107053.0,60.1
2018-07-03 06:00:00,7774.0,82.0,75.0,30.01,,0.0,19.37,4271649.0,7107053.0,60.1
2018-07-03 07:00:00,7527.0,84.0,75.0,30.01,,0.0,19.37,4271649.0,7107053.0,60.1
2018-07-03 08:00:00,7374.0,86.0,76.0,30.01,,0.0,19.37,4271649.0,7107053.0,60.1
2018-07-03 09:00:00,7355.0,89.0,76.0,30.03,,3.0,19.37,4271649.0,7107053.0,60.1
...,...,...,...,...,...,...,...,...,...,...
2023-08-05 14:00:00,6168.0,84.0,58.0,29.81,,6.0,22.33,4165918.0,6747734.0,61.7
2023-08-05 15:00:00,6431.0,83.0,60.0,29.82,,7.0,22.33,4165918.0,6747734.0,61.7
2023-08-05 16:00:00,6599.0,82.0,55.0,29.82,,0.0,22.33,4165918.0,6747734.0,61.7
2023-08-05 17:00:00,6745.0,81.0,56.0,29.82,,5.0,22.33,4165918.0,6747734.0,61.7


In [31]:
# Test formatting
y_index = np.random.randint(y.shape[0]*y.shape[1]-1)
y_sample_index = y_index//B
y_batch_index = y_index%B

x_index = y_index + 1
x_sample_index = x_index//B
x_batch_index = x_index%B

display(x[x_sample_index,x_batch_index,-1] * (input_max_vals-input_min_vals) + input_min_vals)
display(y[y_sample_index,y_batch_index] * (output_max_vals-output_min_vals) + output_min_vals)

IndexError: too many indices for array: array is 2-dimensional, but 3 were indexed

In [74]:
import numpy as np

class LSTM(t.nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout:float = 0):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = t.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.dropout = t.nn.Dropout(dropout)
        self.fc = t.nn.Sequential(
            # t.nn.Linear(hidden_size, hidden_size),
            t.nn.Linear(hidden_size, output_size),
            t.nn.Softplus() 
        ) 
        self.c0 = None
        self.h0 = None

    def forward(self, x_observed, x_time, save_cell_state:bool=False, bayesian_predict:bool=False):
        x = t.cat([x_observed, x_time], dim=-1)
        device = next(self.parameters()).device
        if save_cell_state: # Use previous hidden state and cell state
            if self.c0 is None: # no previous states so need to initialize them
                self.h0 = t.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
                self.c0 = t.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
            h0 = self.c0
            c0 = self.h0
        else: # Initialize hidden state and cell state
            h0 = t.zeros(self.num_layers, x.size(0), self.hidden_size, device=device)
            c0 = t.zeros(self.num_layers, x.size(0), self.hidden_size, device=device)


        # Forward propagate LSTM
        out, (h0, c0) = self.lstm(x, (h0, c0))

        # Decode the hidden state of the last time step
        if self.eval or bayesian_predict:
            out = self.dropout(out)
        out = self.fc(out[:, -1, :])
        return out
    
    def predict_future(self, initial_input, input_order): pass


# Example usage:
display(train_loader.dataset[0][0].shape)
input_size = train_loader.dataset[0][0].shape[-1] + train_loader.dataset[0][1].shape[-1]
hidden_size = 64  # Number of LSTM units (hidden size)
num_layers = 1  # Number of LSTM layers
output_size = train_loader.dataset[0][2].shape[-1]
seq_length = S  # Length of input sequence (assuming hourly data)

# Forward pass
model = LSTM(input_size, hidden_size, num_layers, output_size)
output = model(train_loader.dataset[0][0][None,:,:], train_loader.dataset[0][1][None,:,:])
print("Output shape:", output.shape)

torch.Size([168, 12])

Output shape: torch.Size([1, 12])


Train

In [45]:
display(len(validation_loader))

4479

In [110]:
# set device
device = "cuda"
# device = "cpu"
# Initialize the LSTM model
model = LSTM(input_size, hidden_size, num_layers, output_size).to(device=device)

# pull out last 10% of data to use for early stopping validation

# define optimizer
criterion = t.nn.MSELoss(reduction="none")
optimizer = t.optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
# x_device = t.Tensor(x).to(device=device)
# y_device = t.Tensor(y).to(device=device)
weights = t.Tensor([1]+[0.1]*(train_loader.dataset[0][2].shape[-1]-1)).to(device=device)

for epoch in range(10):
    # for b in range(x.shape[0]):
    losses = []
    for b, (inputs, time_inputs, targets) in enumerate(train_loader):
        inputs = inputs.to(device=device)
        time_inputs = time_inputs.to(device=device)
        targets = targets.to(device=device)
        # outputs = model(x_device[b])
        outputs = model(inputs, time_inputs)
        optimizer.zero_grad()
        loss = criterion(targets, outputs) * 1000
        
        loss = loss * weights[None,:]
        loss = t.mean(loss)
        # print(loss.item())
        losses.append(loss.item())
        # loss = criterion(y_gpu[b], outputs)
        loss.backward()
        # Print gradients of model parameters
        if (epoch % 25 == 0) & (b == 0):
            for name, param in model.named_parameters():
                if param.grad is not None:
                    print(f"Gradient of {name}:")
                    print(param.grad)
                    print(param)
        optimizer.step()
        # print("Batch {}, Loss: {:.2f}".format(b+1, loss.item()))
    # print("NEW EPOCH\n\n\n")
    # if epoch == 1: break
    losses = np.mean(losses)
    print("Epoch {}, Loss: {}".format(epoch+1, losses.item()))

Gradient of lstm.weight_ih_l0:
tensor([[-1.6588e-02, -4.6540e-02, -4.8634e-02,  ..., -1.8969e-02,
         -1.0849e-02, -3.1727e-02],
        [ 1.0101e-03,  1.0575e-03,  9.6208e-04,  ...,  4.4340e-04,
          1.0418e-04,  7.2900e-04],
        [ 1.9117e-02,  5.6601e-02,  5.9159e-02,  ...,  2.2999e-02,
          1.3392e-02,  3.8444e-02],
        ...,
        [-3.8378e-02, -1.3294e-01, -1.3709e-01,  ..., -6.2205e-02,
         -3.0634e-02, -8.8470e-02],
        [ 4.7944e-02,  1.5716e-01,  1.6141e-01,  ...,  7.3618e-02,
          3.5842e-02,  1.0452e-01],
        [ 1.0967e-02,  4.1576e-02,  4.2596e-02,  ...,  2.0406e-02,
          9.6233e-03,  2.7321e-02]], device='cuda:0')
Parameter containing:
tensor([[-0.0878, -0.0760,  0.0352,  ..., -0.0024, -0.0603,  0.0099],
        [-0.1065, -0.0567,  0.0842,  ...,  0.0141, -0.1165,  0.0780],
        [-0.0496,  0.1164,  0.0099,  ..., -0.0299,  0.0905, -0.0729],
        ...,
        [-0.1036,  0.0126,  0.1209,  ..., -0.0426, -0.1155, -0.0812],
     

In [154]:
model = model.to(device=device)
predictions = []
with t.no_grad():
    # validation
    losses = []
    inputs = train_loader.dataset[-1][0][None,:,:].to(device=device)
    for b, (time_inputs, targets) in enumerate(validation_loader):
        # print(train_loader.dataset[-1][0].shape)
        # print(inputs.shape)
        time_inputs = time_inputs.to(device=device)
        targets = targets.to(device=device)
        # print(inputs.device)
        # print(time_inputs.device)
        # print(model.device)
        output = model(inputs, time_inputs)
        predictions.append(output.cpu().numpy())
        loss = criterion(targets, output) * 1000
        
        loss = loss * weights[None,:]
        loss = t.mean(loss)
        losses.append(loss.item())

        inputs = inputs[:,1:]
        # print(inputs.shape)
        # print(output * (output_max_vals.iloc[0]-output_min_vals.iloc[0]) + output_min_vals.iloc[0])
        # print(output[:,None,:].shape)
        # print(output.device)
        output = output
        inputs = t.cat([inputs, output[:,None,:]], dim=1)
        # print(inputs.shape)

    losses = np.mean(losses)
    predictions = np.array(predictions)
    print("Epoch {}, Loss: {}".format(epoch+1, losses.item()))

Epoch 10, Loss: 8.319259101789854


In [155]:
# tensor_dataset = torch.cat([batch for batch in data_loader])
display(predictions.shape)
dependent_variable = "Energy Demand (MWH)"
prediction_data = (predictions[:,:,0] * (output_max_vals[dependent_variable]-output_min_vals[dependent_variable]) + output_min_vals[dependent_variable]).flatten()
display(prediction_data)

val_data = t.cat([batch[1] for batch in validation_loader])
val_data = (val_data[:,0].cpu().numpy() * (output_max_vals[dependent_variable]-output_min_vals[dependent_variable]) + output_min_vals[dependent_variable]).flatten()
display(val_data)

trace1 = go.Scatter(x = np.array(range(0, val_data.shape[0])), y=val_data, mode="lines")
trace2 = go.Scatter(x = np.array(range(0, prediction_data.shape[0])), y=prediction_data, mode="lines")

fig = go.Figure([trace1, trace2])
fig.show()

(4479, 1, 12)

array([5229.2383, 4957.4023, 4690.757 , ..., 4830.7925, 5046.7363,
       5254.698 ], dtype=float32)

array([5012., 4696., 4428., ..., 6599., 6745., 6810.], dtype=float32)

In [None]:
predictions = []
with t.no_grad():
    # validation
    losses = []
    for b, (inputs, time_inputs, targets) in enumerate(train_loader):
        # print(train_loader.dataset[-1][0].shape)
        inputs = train_loader.dataset[-1][0][None,:,:].to(device=device)
        # print(inputs.shape)
        time_inputs = time_inputs.to(device=device)
        targets = targets.to(device=device)
        # print(inputs.device)
        # print(time_inputs.device)
        # print(model.device)
        output = model(inputs, time_inputs)
        predictions.append(output.cpu().numpy())
        loss = criterion(targets, output) * 1000
        
        loss = loss * weights[None,:]
        loss = t.mean(loss)
        losses.append(loss.item())

        inputs = inputs[:,1:]
        # print(inputs.shape)
        # print(output * (output_max_vals.iloc[0]-output_min_vals.iloc[0]) + output_min_vals.iloc[0])
        # print(output[:,None,:].shape)
        # print(output.device)
        output = output
        inputs = t.cat([inputs, output[:,None,:]], dim=1)
        # print(inputs.shape)

    losses = np.mean(losses)
    predictions = np.array(predictions)
    print("Epoch {}, Loss: {}".format(epoch+1, losses.item()))

In [None]:
# plot training data. Trying to troubleshoot validation.

Define validation function

In [85]:
# initial_input = train_loader.dataset[-1][0]
# display(train_loader.dataset[-1][1].numpy().shape)
# validation_time = train_loader.dataset[-1][1][-1].numpy() * (input_time_max_vals-input_time_min_vals) + input_time_min_vals

display(initial_validation_time)

# Define the number of periods (hours)
N = len(validation_loader)  # 7 days worth of hourly data

# Generate hourly timestamps
validation_time = [initial_validation_time + pd.Timedelta(hours=i) for i in range(N)]
validation_time = pd.DatetimeIndex(validation_time)
validation_time = pd.DataFrame(data={"Hour of Day":validation_time.hour, "Day of Week":validation_time.dayofweek, "Day of Year":validation_time.dayofyear})

Timestamp('2023-01-31 04:00:00')

DatetimeIndex(['2023-01-31 04:00:00', '2023-01-31 05:00:00',
               '2023-01-31 06:00:00', '2023-01-31 07:00:00',
               '2023-01-31 08:00:00', '2023-01-31 09:00:00',
               '2023-01-31 10:00:00', '2023-01-31 11:00:00',
               '2023-01-31 12:00:00', '2023-01-31 13:00:00',
               ...
               '2023-08-05 09:00:00', '2023-08-05 10:00:00',
               '2023-08-05 11:00:00', '2023-08-05 12:00:00',
               '2023-08-05 13:00:00', '2023-08-05 14:00:00',
               '2023-08-05 15:00:00', '2023-08-05 16:00:00',
               '2023-08-05 17:00:00', '2023-08-05 18:00:00'],
              dtype='datetime64[ns]', length=4479, freq=None)

Unnamed: 0,Hour of Day,Day of Week,Day of Year
0,4,1,31
1,5,1,31
2,6,1,31
3,7,1,31
4,8,1,31
...,...,...,...
4474,14,5,217
4475,15,5,217
4476,16,5,217
4477,17,5,217


4479

Rolling Cross Validation