In [78]:
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

monthly = pd.read_csv('../../../Dataset/Boulder_Monthly.csv')
monthly.drop(columns={'Unnamed: 0'}, inplace=True)
monthly.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Energy__kWh_   55 non-null     float64
 1   Month          55 non-null     object 
 2   Minimum T      55 non-null     int64  
 3   Maximum T      55 non-null     int64  
 4   Snow           55 non-null     float64
 5   Precipitation  55 non-null     float64
dtypes: float64(3), int64(2), object(1)
memory usage: 2.7+ KB


In [79]:
# Define the columns we need to scale and we need to use for One-Hot Encoding
columns_to_scale = ['Energy__kWh_', 'Minimum T', 'Maximum T', 'Snow', 'Precipitation']
categorical_columns = ['Month']

# MinMax scaling for numerical columns and One-hot encoding for categorical columns
scaler = MinMaxScaler()
monthly_scaled = monthly.copy()
monthly_scaled[columns_to_scale] = scaler.fit_transform(monthly[columns_to_scale])

# One-hot encoding for categorical columns
onehot_encoder = OneHotEncoder(sparse=False)
categorical_encoded = onehot_encoder.fit_transform(monthly[categorical_columns])

# Get the feature names from the encoder
encoded_columns = []
for col, values in zip(categorical_columns, onehot_encoder.categories_):
    encoded_columns.extend([f'{col}_{value}' for value in values])

# Create DataFrame with encoded columns
categorical_encoded_df = pd.DataFrame(categorical_encoded, columns=encoded_columns)

# Concatenate the new encoded columns to the original DataFrame
monthly_scaled = pd.concat([monthly_scaled, categorical_encoded_df], axis=1)

# Drop the original categorical columns
monthly_scaled = monthly_scaled.drop(categorical_columns, axis=1)


# Split the dataset into training, validation, and testing sets
def split_dataset(df, train_ratio, val_ratio):

    total_size = len(df)
    train_size = int(train_ratio * total_size)
    val_size = int(val_ratio * total_size)
    test_size = total_size - train_size - val_size

    train_df = df[:train_size]
    val_df = df[train_size:train_size + val_size]
    test_df = df[train_size + val_size:]

    assert len(train_df) + len(val_df) + len(test_df) == total_size, "Dataset not split correctly."

    print(f'Training split ratio:   {round(len(train_df) / len(df), 3)}')
    print(f'Validation split ratio: {round(len(val_df) / len(df), 3)}')
    print(f'Testing split ratio:    {round(len(test_df) / len(df), 3)}')
    print("\nShapes of the datasets:")
    print(train_df.shape, val_df.shape, test_df.shape)

    return train_df, val_df, test_df

train_monthly_scaled, val_monthly_scaled, test_monthly_scaled = split_dataset(monthly_scaled, train_ratio=0.7, val_ratio=0.2)

Training split ratio:   0.691
Validation split ratio: 0.2
Testing split ratio:    0.109

Shapes of the datasets:
(38, 17) (11, 17) (6, 17)


In [80]:
# Reshape the data
def create_sequences(data, sequence_length):
    inputs = []
    targets = []
    for i in range(len(data) - sequence_length):
        sequence = data.iloc[i:i + sequence_length].values
        target = data.iloc[i + sequence_length]['Energy__kWh_']  # Predict the next value
        inputs.append(sequence)
        targets.append(target)

    inputs_array = np.array(inputs)
    targets_array = np.array(targets)
    
    print(f'Dataset split into sequences:')
    print(f'Sequences shape: {inputs_array.shape}')
    print(f'Targets shape: {targets_array.shape}\n')

    return np.array(inputs), np.array(targets)

sequence_length = 3
num_features = len(monthly_scaled.columns)

# Create the training, validation, and test data sequences
train_data_inputs, train_data_targets = create_sequences(train_monthly_scaled, sequence_length)
val_data_inputs, val_data_targets = create_sequences(val_monthly_scaled, sequence_length)
test_data_inputs, test_data_targets = create_sequences(test_monthly_scaled, sequence_length)

Dataset split into sequences:
Sequences shape: (35, 3, 17)
Targets shape: (35,)

Dataset split into sequences:
Sequences shape: (8, 3, 17)
Targets shape: (8,)

Dataset split into sequences:
Sequences shape: (3, 3, 17)
Targets shape: (3,)



In [81]:
# The input Datasets must have this input shape (-1, sequence_length, num_features)
train_data_inputs = train_data_inputs.reshape((-1, sequence_length, num_features))
val_data_inputs = val_data_inputs.reshape((-1, sequence_length, num_features))
test_data_inputs = test_data_inputs.reshape((-1, sequence_length, num_features))

train_data_inputs.shape, val_data_inputs.shape, test_data_inputs.shape

((35, 3, 17), (8, 3, 17), (3, 3, 17))

In [82]:
class ScaledDotProductAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(ScaledDotProductAttention, self).__init__(**kwargs)

    def call(self, queries, keys, values, mask):
        # Calculate the dot product of query and key matrices
        matmul_qk = tf.matmul(queries, keys, transpose_b=True) # Q * K.T

        # Get the dimension of the key matrix and cast to float32
        d_k = keys.shape[-1]
        d_k = tf.cast(d_k, tf.float32)

        # Scale the attention scores by the square root of the key dimension / Scoring the queries against the keys after transposing the latter, and scaling
        scaled_attention_scores = matmul_qk / tf.math.sqrt(d_k)

        # Apply the mask to the attention scores (if mask is provided)
        if mask is not None:
            scaled_attention_scores += (mask * -1e9)

        # Computing the weights by a softmax operation
        attention_weights = tf.nn.softmax(scaled_attention_scores, axis=-1)

        # Calculate the output by multiplying attention weights with value matrix
        output = tf.matmul(attention_weights, values)

        return output
    
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, h, d_k, d_v, d_model, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.h = h
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model

        # Initialize linear layers for projections
        self.query_projection = tf.keras.layers.Dense(units=(h * d_k), activation=None)
        self.key_projection = tf.keras.layers.Dense(units=(h * d_k), activation=None)
        self.value_projection = tf.keras.layers.Dense(units=(h * d_v), activation=None)

        # Initialize the final linear layer
        self.output_projection = tf.keras.layers.Dense(units=d_model, activation=None)

        # Initialize attention layer
        self.attention = ScaledDotProductAttention()

    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]

        # Linear projections
        queries = tf.reshape(self.query_projection(queries), (batch_size, -1, self.h, self.d_k))
        keys = tf.reshape(self.key_projection(keys), (batch_size, -1, self.h, self.d_k))
        values = tf.reshape(self.value_projection(values), (batch_size, -1, self.h, self.d_v))

        # Transpose to have dimensions [batch_size, num_heads, seq_len, d_k/d_v]
        queries = tf.transpose(queries, perm=[0, 2, 1, 3])
        keys = tf.transpose(keys, perm=[0, 2, 1, 3])
        values = tf.transpose(values, perm=[0, 2, 1, 3])

        # Apply attention
        attention_output = self.attention(queries, keys, values, mask)

        # Transpose and concatenate to get the final output
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
        attention_output = tf.reshape(attention_output, (batch_size, -1, self.h * self.d_v))

        # Apply final linear layer
        output = self.output_projection(attention_output)

        return output
    

class PositionwiseFeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff, dropout_rate, **kwargs):
        super(PositionwiseFeedForward, self).__init__(**kwargs)

        # Feedforward neural network with a ReLU activation
        self.ffn = tf.keras.Sequential([tf.keras.layers.Dense(d_ff, activation='relu'),tf.keras.layers.Dense(d_model, activation=None)])

        # Dropout layer to prevent overfitting
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs):
        # Pass the inputs through the feedforward neural network
        ff_output = self.ffn(inputs)

        # Apply dropout to the output
        ff_output = self.dropout(ff_output)

        return ff_output

In [83]:
def Encoder(encoder_input, num_heads, d_ff, dropout_rate, encoder_mask):
    inputs = encoder_input
    
    # Extract the size of the model from the input shape
    d_model = inputs.shape[-1]

    # Multi-Head Self Attention
    attention_output = MultiHeadAttention(h=num_heads,
                                 d_k=d_model // num_heads,
                                 d_v=d_model // num_heads,
                                 d_model=d_model)(inputs, inputs, inputs, mask=encoder_mask)
    # Apply dropout for regularization
    attention_output = tf.keras.layers.Dropout(dropout_rate)(attention_output)

    # Add and Normalize step after Multi-Head Self Attention
    norm_attention_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attention_output)

    # Feedforward Neural Network
    ffn = PositionwiseFeedForward(
        d_model=d_model,
        d_ff=d_ff,
        dropout_rate=dropout_rate
    )
    ff_output = ffn(norm_attention_output)
    # Apply dropout for regularization
    ff_output = tf.keras.layers.Dropout(dropout_rate)(ff_output)

    # Add and Normalize step after the Feedforward Neural Network
    encoder_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(norm_attention_output + ff_output)

    return encoder_output

In [84]:
def Decoder(inputs, encoder_output, num_heads, d_ff, dropout_rate, decoder_mask):
    inputs = inputs

    # Extract the size of the model from the input shape
    d_model = inputs.shape[-1]

    # Masked Self-Attention
    masked_attention_output = MultiHeadAttention(
                                                 h=num_heads,
                                                 d_k=d_model // num_heads,
                                                 d_v=d_model // num_heads,
                                                 d_model=d_model
                                                )(inputs, inputs, inputs, mask=decoder_mask)
    # Apply dropout for regularization
    masked_attention_output = tf.keras.layers.Dropout(dropout_rate)(masked_attention_output)

    # Add and Normalize the Masked Self-Attention output
    norm_masked_attention_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(masked_attention_output + inputs)

    # Cross-Attention with Encoder Output
    attention_output = MultiHeadAttention(
                                          h=num_heads,
                                          d_k=d_model // num_heads,
                                          d_v=d_model // num_heads,
                                          d_model=d_model
                                         )(norm_masked_attention_output, encoder_output, encoder_output, mask=decoder_mask)  # < ----- try and mask = mask
    # Apply dropout for regularization
    attention_output = tf.keras.layers.Dropout(dropout_rate)(attention_output)

    # Add and Normalize the Cross-Attention output
    attention_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(norm_masked_attention_output + attention_output)

    # Feedforward Neural Network
    ffn = PositionwiseFeedForward(
                                  d_model=d_model,
                                  d_ff=d_ff,
                                  dropout_rate=dropout_rate
                                 )
    ff_output = ffn(attention_output)
    # Apply dropout for regularization
    ff_output = tf.keras.layers.Dropout(dropout_rate)(ff_output)

    # Add and Normalize
    decoder_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention_output + ff_output)

    return decoder_output

In [85]:
def TransformerModel(input_shape, num_heads, d_ff, num_layers, dropout_rate, encoder_mask, decoder_mask):
    # Define the input layer
    inputs = tf.keras.layers.Input(shape=input_shape)

    # Encoder
    encoder = inputs
    for _ in range(num_layers):
        # Apply the Encoder function to the input for each layer
        encoder_output = Encoder(encoder, num_heads, d_ff, dropout_rate, encoder_mask)

    # Decoder
    decoder = encoder_output
    for _ in range(num_layers):
        # Apply the Decoder function to the encoder output for each layer
        decoder_output = Decoder(decoder, encoder, num_heads, d_ff, dropout_rate, decoder_mask)

    # Generate the final output with a TimeDistributed Dense layer
    pull_time_window = tf.keras.layers.GlobalAveragePooling1D(data_format='channels_first')(decoder_output)
    print("Shape of pull_time_window:", pull_time_window.shape)

    pull_time_window = tf.keras.layers.Dropout(0.1)(pull_time_window)
    outputs = tf.keras.layers.Dense(1, activation='linear')(pull_time_window)
    #outputs = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=1))(decoder_output)

    # Build the Keras model using the specified inputs and outputs
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    return model

In [87]:
def TransformerModel(input_shape, num_heads, d_ff, num_layers, dropout_rate, encoder_mask, decoder_mask):
    # Define the input layer
    inputs = tf.keras.layers.Input(shape=input_shape)

    # Encoder
    encoder = inputs
    for _ in range(num_layers):
        # Apply the Encoder function to the input for each layer
        encoder_output = Encoder(encoder, num_heads, d_ff, dropout_rate, encoder_mask)

    # Decoder
    decoder = encoder_output
    for _ in range(num_layers):
        # Apply the Decoder function to the encoder output for each layer
        decoder_output = Decoder(decoder, encoder, num_heads, d_ff, dropout_rate, decoder_mask)

    # Generate the final output with a TimeDistributed Dense layer
    pull_time_window = tf.keras.layers.GlobalAveragePooling1D(data_format='channels_first')(decoder_output)
    pull_time_window = tf.keras.layers.Dropout(0.1)(pull_time_window)
    outputs = tf.keras.layers.Dense(1, activation='linear')(pull_time_window)
    #outputs = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=1))(decoder_output)

    # Build the Keras model using the specified inputs and outputs
    model = keras.Model(inputs=inputs, outputs=outputs)
    
    return model

In [88]:
# Define the hyperparameters of the manual model
input_shape = (sequence_length, num_features)
num_heads = 1
d_ff = 64
num_layers = 3
dropout_rate = 0.1
encoder_mask = None
decoder_mask = tf.linalg.band_part(tf.ones((sequence_length, sequence_length)), -1, 0)  # Create a lower triangular mask
decoder_mask = 1 - decoder_mask  # Invert the mask

# Create the transformer model
manul_model = TransformerModel(input_shape, num_heads, d_ff, num_layers, dropout_rate, encoder_mask, decoder_mask)

manul_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 3, 17)]      0           []                               
                                                                                                  
 multi_head_attention_56 (Multi  (None, None, 17)    1224        ['input_7[0][0]',                
 HeadAttention)                                                   'input_7[0][0]',                
                                                                  'input_7[0][0]']                
                                                                                                  
 dropout_138 (Dropout)          (None, None, 17)     0           ['multi_head_attention_56[0][0]']
                                                                                            

In [89]:
def root_mean_squared_error(y_true, y_pred):
    return tf.keras.backend.sqrt(
        tf.keras.backend.mean(
            tf.keras.backend.square(
                y_pred - y_true
            )
        ) + 1e-9
    )

# Define the learning rate for Adam optimizer
learning_rate = 0.01

# Compile the manual model
manul_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse',  metrics=['mae', 'mse', root_mean_squared_error])


In [90]:
# Define the parameters for training
epochs = 200
batch_size = 32

# Convert the data to float32
train_data_inputs = train_data_inputs.astype('float32')
train_data_targets = train_data_targets.astype('float32')

val_data_inputs = val_data_inputs.astype('float32')
val_data_targets = val_data_targets.astype('float32')

train_data_inputs.shape, train_data_targets.shape, val_data_inputs.shape, val_data_targets.shape

((35, 3, 17), (35,), (8, 3, 17), (8,))

In [91]:
# Train the manual model
manul_model.fit(train_data_inputs, train_data_targets,
          validation_data=(val_data_inputs, val_data_targets),
          epochs=epochs, batch_size=batch_size)

Epoch 1/200


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

<keras.callbacks.History at 0x2744fded0d0>

In [92]:
# Evaluate on validation set
val_metrics_manul = manul_model.evaluate(val_data_inputs, val_data_targets, return_dict=True)

# Evaluate on test set
test_metrics_manul = manul_model.evaluate(test_data_inputs, test_data_targets, return_dict=True)

# Extract individual metrics
val_loss_manul, val_mae_manul, val_mse_manul, val_rmse_manul = val_metrics_manul['loss'], val_metrics_manul['mae'], val_metrics_manul['mse'], val_metrics_manul['root_mean_squared_error']
test_loss_manul, test_mae_manul, test_mse_manul, test_rmse_manul = test_metrics_manul['loss'], test_metrics_manul['mae'], test_metrics_manul['mse'], test_metrics_manul['root_mean_squared_error']

print('\n\nManual Transformer:\n-------------------')
print(f'Validation Loss: {val_loss_manul}, Validation MSE: {val_mse_manul}, Validation MAE: {val_mae_manul}, Validation RMSE: {val_rmse_manul}')
print(f'Test Loss: {test_loss_manul}, Test MSE: {test_mse_manul}, Test MAE: {test_mae_manul}, Test RMSE: {test_rmse_manul}')




Manual Transformer:
-------------------
Validation Loss: 0.04590541124343872, Validation MSE: 0.04590541124343872, Validation MAE: 0.1526624858379364, Validation RMSE: 0.21425548195838928
Test Loss: 0.03392934426665306, Test MSE: 0.03392934426665306, Test MAE: 0.17364966869354248, Test RMSE: 0.18419919908046722


In [93]:
# Assuming manul_model.predict returns the predictions
val_predictions_manul = manul_model.predict(val_data_inputs)
test_predictions_manul = manul_model.predict(test_data_inputs)

# Calculate MAE and RMSE for validation set
val_mae_manul = np.mean(np.abs(val_data_targets - val_predictions_manul))
val_rmse_manul = np.sqrt(np.mean(np.square(val_data_targets - val_predictions_manul)))

# Calculate MAE and RMSE for test set
test_mae_manul = np.mean(np.abs(test_data_targets - test_predictions_manul))
test_rmse_manul = np.sqrt(np.mean(np.square(test_data_targets - test_predictions_manul)))

print('\n\nManual Transformer:\n-------------------')
print(f'Validation MAE: {val_mae_manul}')
print(f'Validation RMSE: {val_rmse_manul}')
print(f'\nTest MAE: {test_mae_manul}')
print(f'Test RMSE: {test_rmse_manul}')
print('\n==============================')



Manual Transformer:
-------------------
Validation MAE: 0.1547253578901291
Validation RMSE: 0.2175816148519516

Test MAE: 0.17318914630968676
Test RMSE: 0.183777361300491

