In [1]:
# Data manipulation, refer: https://cienciadedatos.net/documentos/py29-forecasting-electricity-power-demand-python
# ==============================================================================
import numpy as np
import pandas as pd
# from astral.sun import sun
# from astral import LocationInfo
from skforecast.datasets import fetch_dataset

# Plots
# ==============================================================================
import matplotlib.pyplot as plt





## This notebook use "Demand" and "Temperature" 2 features for prediction.
- We DO NOT rescale/normalize based on training dataset
- The model is transformer take in to (batch_zie, sequence, feat=2) as input and output predicts (batch_zie, sequence, feat=2)
- We use "sequence" length of data to predict "one" data point after the sequence. 

In [2]:
data = fetch_dataset(name='vic_electricity', raw=True)
data.info()

vic_electricity
---------------
Half-hourly electricity demand for Victoria, Australia
O'Hara-Wild M, Hyndman R, Wang E, Godahewa R (2022).tsibbledata: Diverse
Datasets for 'tsibble'. https://tsibbledata.tidyverts.org/,
https://github.com/tidyverts/tsibbledata/.
https://tsibbledata.tidyverts.org/reference/vic_elec.html
Shape of the dataset: (52608, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52608 entries, 0 to 52607
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Time         52608 non-null  object 
 1   Demand       52608 non-null  float64
 2   Temperature  52608 non-null  float64
 3   Date         52608 non-null  object 
 4   Holiday      52608 non-null  bool   
dtypes: bool(1), float64(2), object(2)
memory usage: 1.7+ MB


In [3]:
# Data preparation
# ==============================================================================
data = data.copy()
data['Time'] = pd.to_datetime(data['Time'], format='%Y-%m-%dT%H:%M:%SZ')
data = data.set_index('Time') # use time as index
data = data.asfreq('30min')
data = data.sort_index() # sort by time order
data.head(4)

Unnamed: 0_level_0,Demand,Temperature,Date,Holiday
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-12-31 13:00:00,4382.825174,21.4,2012-01-01,True
2011-12-31 13:30:00,4263.365526,21.05,2012-01-01,True
2011-12-31 14:00:00,4048.966046,20.7,2012-01-01,True
2011-12-31 14:30:00,3877.56333,20.55,2012-01-01,True


In [4]:
# Aggregating in 1H intervals
# ==============================================================================
# The Date column is eliminated so that it does not generate an error when aggregating.

# also, in this version we drop "Holiday"
data = data.drop(columns="Date")
data = data.drop(columns="Holiday")
data = (
    data
    .resample(rule="h", closed="left", label="right")
    .agg({
        "Demand": "mean",
        "Temperature": "mean",
        # "Holiday": "mean",
    })
)
data.head(4)

Unnamed: 0_level_0,Demand,Temperature
Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-12-31 14:00:00,4323.09535,21.225
2011-12-31 15:00:00,3963.264688,20.625
2011-12-31 16:00:00,3950.913495,20.325
2011-12-31 17:00:00,3627.860675,19.85


In [5]:
data.tail()

Unnamed: 0_level_0,Demand,Temperature
Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-12-31 09:00:00,4069.62555,21.6
2014-12-31 10:00:00,3909.230704,20.3
2014-12-31 11:00:00,3900.600901,19.65
2014-12-31 12:00:00,3758.236494,18.1
2014-12-31 13:00:00,3785.65072,17.2


In [6]:
index = pd.date_range('1/1/2000', periods=9, freq='min')
series = pd.Series(range(9), index=index)
df = series.to_frame()
df

Unnamed: 0,0
2000-01-01 00:00:00,0
2000-01-01 00:01:00,1
2000-01-01 00:02:00,2
2000-01-01 00:03:00,3
2000-01-01 00:04:00,4
2000-01-01 00:05:00,5
2000-01-01 00:06:00,6
2000-01-01 00:07:00,7
2000-01-01 00:08:00,8


In [7]:
# Split data into train-val-test set
# Note that "loc" function includes both the start and the stop are included
end_train = '2013-12-31 23:59:00'
start_val = '2014-01-01 00:00:00'
end_validation = '2014-11-30 23:59:00'
start_test = '2014-12-01 00:00:00'


data_train = data.loc[: end_train, :].copy()
data_val   = data.loc[start_val:end_validation, :].copy()
data_test  = data.loc[start_test: , :].copy()

print(f"Train dates      : {data_train.index.min()} --- {data_train.index.max()}  (n={len(data_train)})")
print(f"Validation dates : {data_val.index.min()} --- {data_val.index.max()}  (n={len(data_val)})")
print(f"Test dates       : {data_test.index.min()} --- {data_test.index.max()}  (n={len(data_test)})")



Train dates      : 2011-12-31 14:00:00 --- 2013-12-31 23:00:00  (n=17554)
Validation dates : 2014-01-01 00:00:00 --- 2014-11-30 23:00:00  (n=8016)
Test dates       : 2014-12-01 00:00:00 --- 2014-12-31 13:00:00  (n=734)


## Normalize data frame based on training data
- You need to normalize the test data using the parameters of the training data.

- refer to https://www.reddit.com/r/econometrics/comments/1547hl5/how_to_deal_with_normalization_in_time_series/

In [8]:
# ## Normalize based on training data for all datasets

# # refer: https://stackoverflow.com/questions/26414913/normalize-columns-of-a-dataframe

# data_train_scaled = (data_train-data_train.mean())/data_train.std()
# data_val_scaled = (data_val-data_train.mean())/data_train.std()
# data_test_scaled = (data_test-data_train.mean())/data_train.std()


## Prepare sequence dataloader for training
 - refer to https://github.com/jeffheaton/app_deep_learning/blob/06ea8bdb9cb18151d3ada51e1fa580690a8245fe//t81_558_class_10_3_transformer_timeseries.ipynb

In [9]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau


# Sequence Data Preparation- use seq_size # of data to predict one data ("window" variable) afterward ("after_window" variable)
SEQUENCE_SIZE = 10
FEAT_DIM = 2 # pick "Demand", "Temperature"--> 2 features
BATCH_SIZE = 100

def to_sequences(seq_size, obs):
    x = []
    y = []
    obs_np = obs.to_numpy()
    feat_dim = obs_np.shape[1]
    for i in range(len(obs_np) - seq_size):
        window = obs_np[i:(i + seq_size),:]
        after_window = obs_np[i + seq_size,:]
        x.append(window)
        y.append(after_window)
    return torch.tensor(x, dtype=torch.float32).view(-1, seq_size, feat_dim), torch.tensor(y, dtype=torch.float32).view(-1, feat_dim)

# x_train, y_train = to_sequences(SEQUENCE_SIZE, data_train_scaled)
# x_val, y_val = to_sequences(SEQUENCE_SIZE, data_val_scaled)
# x_test, y_test = to_sequences(SEQUENCE_SIZE, data_test_scaled)
x_train, y_train = to_sequences(SEQUENCE_SIZE, data_train)
x_val, y_val = to_sequences(SEQUENCE_SIZE, data_val)
x_test, y_test = to_sequences(SEQUENCE_SIZE, data_test)

# Setup data loaders for batch
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(x_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

test_dataset = TensorDataset(x_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

  return torch.tensor(x, dtype=torch.float32).view(-1, seq_size, feat_dim), torch.tensor(y, dtype=torch.float32).view(-1, feat_dim)


## Use Transformer Model for Training
- refer to https://github.com/jeffheaton/app_deep_learning/blob/06ea8bdb9cb18151d3ada51e1fa580690a8245fe//t81_558_class_10_3_transformer_timeseries.ipynb

In [10]:
# Positional Encoding for Transformer (only works for even number of feature dimension)
# so best practice is to first map your row data into even number of feature dim using nn.linear, and then apply (transformer + position enc) from there
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
    
    
'''
Simple tutorial about position encoding are:
# refer to https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/
# refer to https://discuss.pytorch.org/t/transformer-example-position-encoding-function-works-only-for-even-d-model/100986/2
'''

'\nSimple tutorial about position encoding are:\n# refer to https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/\n# refer to https://discuss.pytorch.org/t/transformer-example-position-encoding-function-works-only-for-even-d-model/100986/2\n'

In [11]:
# Model definition using Transformer
class TransformerModel(nn.Module):
    def __init__(self, input_dim=2, d_model=64, nhead=4, num_layers=2, dropout=0.2, pred_feat_dim=2):
        '''
        input_dim: its your time sequence data with selected number of feature used (we picked "Demand", "Temperature", "Holiday" 3 features)
        '''
        super(TransformerModel, self).__init__()

        self.encoder = nn.Linear(input_dim, d_model) # as mention, first use linear layer to map raw data into even number of feature dim (aka, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.decoder = nn.Linear(d_model, pred_feat_dim)

    def forward(self, x):
        x = self.encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = self.decoder(x[:, -1, :])
        return x

device = "cuda" if torch.cuda.is_available() else "cpu"
model = TransformerModel(input_dim=FEAT_DIM, pred_feat_dim=FEAT_DIM).to(device)



In [12]:
# some trainning helper function:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [19]:
# from torch.utils.tensorboard import SummaryWriter



# writer = SummaryWriter('./tensorboard/data_2feat/lr1e-3')

# Train the model
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=3, verbose=True)

epochs = 10 #5000
early_stop_count = 0
min_val_loss = float('inf')
record = {'train_loss': AverageMeter('train_loss', ':.4f'),
          'val_loss': AverageMeter('val_loss',':.4f')}

for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        x_batch, y_batch = batch
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        scheduler.step(loss)
        record['train_loss'].update(loss.detach().item(), y_batch.size(0))
    
    
    # end of epoch do
    # writer.add_scalar('train_loss', record['train_loss'].avg, epoch)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {record['train_loss'].avg:.4f}")

    # Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in test_loader:
            x_batch, y_batch = batch
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            outputs = model(x_batch)
            val_loss = criterion(outputs, y_batch)
            val_losses.append(loss.item())
            record['val_loss'].update(val_loss.detach().item(), y_batch.size(0))
        
        # writer.add_scalar('val_loss', record['val_loss'].avg, epoch)
        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {record['val_loss'].avg:.4f}")



    # if val_loss < min_val_loss:
    #     min_val_loss = val_loss
    #     early_stop_count = 0
    # else:
    #     early_stop_count += 1

    # if early_stop_count >= 5:
    #     print("Early stopping!")
    #     break
    # print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss:.4f}")

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Epoch 1/10, Training Loss: 11332094.7499
Epoch 1/10, Validation Loss: 9408950.3453
Epoch 2/10, Training Loss: 11330953.3023
Epoch 2/10, Validation Loss: 9408950.0608
Epoch 3/10, Training Loss: 11330570.1482
Epoch 3/10, Validation Loss: 9408949.3048
Epoch 4/10, Training Loss: 11330381.2558
Epoch 4/10, Validation Loss: 9408948.9227
Epoch 5/10, Training Loss: 11330262.9415
Epoch 5/10, Validation Loss: 9408948.3243
Epoch 6/10, Training Loss: 11330185.9507
Epoch 6/10, Validation Loss: 9408947.8651
Epoch 7/10, Training Loss: 11330130.0887
Epoch 7/10, Validation Loss: 9408947.2833
Epoch 8/10, Training Loss: 11330088.7207
Epoch 8/10, Validation Loss: 9408946.8104
Epoch 9/10, Training Loss: 11330056.0728
Epoch 9/10, Validation Loss: 9408946.2624
Epoch 10/10, Training Loss: 11330030.9820
Epoch 10/10, Validation Loss: 9408945.7724
