<a href="https://colab.research.google.com/github/PraveshKoirala/Transformers-Project/blob/main/transformer_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
!pip install --upgrade torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Configuration parameters

In [27]:
%%file config.py
# Max number of tokens
dmodel = 512
dim_val = dmodel  # This can be any value divisible by n_heads. 512 is used in the original transformer paper.
target_seq_len = 1  # Length of the target sequence, i.e. how many time steps should your forecast cover
n_encoder_layers = 4  # Number of times the encoder layer is stacked in the encoder
n_decoder_layers = 4  # Number of times the decoder layer is stacked in the decoder
n_heads = 8  # The number of attention heads (aka parallel attention layers). dim_val must be divisible by this number
batch_size = 512

enc_seq_len = 5  # length of input given to encoder. Can have any integer value.
dec_seq_len = enc_seq_len-1  # length of input given to decoder. Can have any integer value.
max_seq_len = enc_seq_len  # What's the longest sequence the model will encounter? Used to make the positional encoder
epochs = 20
lr = 0.001
weight_decay = 0.0001
ratio = 0.8
DEVICE = "cuda"
TIME_EMBEDDING = 32
SEGMENT_EMBEDDING = 128
DAY_EMBEDDING = 7
NUM_DAY = 7
NUM_SEGMENTS = 50
NUM_TIME=96
NUM_CONTINUOUS=8
dim_continuous = dmodel - TIME_EMBEDDING - SEGMENT_EMBEDDING - DAY_EMBEDDING

Overwriting config.py


# Datasets and Dataloader stuff

In [28]:
from google.colab import drive
drive.mount('/content/drive')
drive_prefix="/content/drive/MyDrive/Transformers/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
%%capture
!pip install --upgrade pandas

In [30]:
%%file dataset.py
import torch
from torch.utils.data import Dataset
import pandas as pd
import config
import numpy as np

class MyDataset(Dataset):
    def __init__(self, data, buckets):
        self.buckets=buckets
        self.data = data

    def __getitem__(self, item):
        cols = self.data.iloc[self.buckets[item]]
        return  torch.tensor(cols[['time_window', 'dayofweek', 'segment_id_int',
                                   'is_holiday', 'is_school_break', 
                                   'travel_time', 'darksky_temperature',
                                   'darksky_humidity', 
                                   'darksky_precipitation_probability',
                                   'traffic_speed', 'distance_m']].astype(np.float).values), \
                torch.tensor(cols[['delay_time']].iloc[:config.max_seq_len-1].astype(np.float).values), \
                torch.tensor(cols[['delay_time']].iloc[config.max_seq_len-1].astype(np.float).values)

    def __len__(self):
        return len(self.buckets)

Overwriting dataset.py


In [31]:
%%file metrics.py
import numpy as np

def get_mape(x, y):
    return np.mean(np.abs((x-y)/x)) * 100


def get_rmse(x, y):
    return np.sqrt(np.mean(np.square(x - y)))

Overwriting metrics.py


In [32]:
%%file penc.py
import torch
import math
from torch import nn, Tensor


class PositionalEncoder(nn.Module):
    """
    Adapted from:
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    https://github.com/LiamMaclean216/Pytorch-Transfomer/blob/master/utils.py
    """

    def __init__(self, dropout: float = 0.1, max_seq_len: int = 5000, d_model: int = 512):
        """
        Args:
            dropout: the dropout rate
            max_seq_len: the maximum length of the input sequences
            d_model: The dimension of the output of sub-layers in the model
                     (Vaswani et al, 2017)
        """

        super().__init__()

        self.d_model = d_model

        self.dropout = nn.Dropout(p=dropout)

        # Create constant positional encoding matrix with values
        # dependent on position and i
        position = torch.arange(max_seq_len).unsqueeze(1)

        exp_input = torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)

        div_term = torch.exp(exp_input)  # Returns a new tensor with the exponential of the elements of exp_input

        pe = torch.zeros(max_seq_len, d_model)

        pe[:, 0::2] = torch.sin(position * div_term)

        pe[:, 1::2] = torch.cos(position * div_term)  # torch.Size([target_seq_len, dim_val])

        pe = pe.unsqueeze(0).transpose(0, 1)  # torch.Size([target_seq_len, input_size, dim_val])

        # register that pe is not a model parameter
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, enc_seq_len, dim_val]
        """

        add = self.pe[:x.size(1), :].squeeze(1)

        x = x + add

        return self.dropout(x)

Overwriting penc.py


In [33]:
%%file utils.py
import torch


def generate_square_subsequent_mask(dim1: int, dim2: int, dim3: int) -> torch.Tensor:
    """
    Generates an upper-triangular matrix of -inf, with zeros on diag.
    Modified from:
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    Args:
        dim1: int, batch_size * n_heads
        dim2: int. For src and trg masking this must be target sequence length.
        dim3: int. For src masking, this must be encoder sequence length.
              For trg masking, this must be target sequence length
    Return:
        A Tensor of shape [dim1, dim2, dim3]
    """
    return torch.triu(torch.ones(dim1, dim2, dim3) * float('-inf'), diagonal=1)

Overwriting utils.py


In [34]:
%%file time_series_transformer.py
import torch
from torch import nn, Tensor

from penc import PositionalEncoder
from config import *

class TimeSeriesTransformer(nn.Module):
    """
    This class implements a transformer model that can be used for times series
    forecasting. This time series transformer model is based on the paper by
    Wu et al (2020) [1]. The paper will be referred to as "the paper".
    A detailed description of the code can be found in my article here:
    https://towardsdatascience.com/how-to-make-a-pytorch-transformer-for-time-series-forecasting-69e073d4061e
    In cases where the paper does not specify what value was used for a specific
    configuration/hyperparameter, this class uses the values from Vaswani et al
    (2017) [2] or from PyTorch source code.
    Unlike the paper, this class assumes that input layers, positional encoding
    layers and linear mapping layers are separate from the encoder and decoder,
    i.e. the encoder and decoder only do what is depicted as their sub-layers
    in the paper. For practical purposes, this assumption does not make a
    difference - it merely means that the linear and positional encoding layers
    are implemented inside the present class and not inside the
    Encoder() and Decoder() classes.
    [1] Wu, N., Green, B., Ben, X., O'banion, S. (2020).
    'Deep Transformer Models for Time Series Forecasting:
    The Influenza Prevalence Case'.
    arXiv:2001.08317 [cs, stat] [Preprint].
    Available at: http://arxiv.org/abs/2001.08317 (Accessed: 9 March 2022).
    [2] Vaswani, A. et al. (2017)
    'Attention Is All You Need'.
    arXiv:1706.03762 [cs] [Preprint].
    Available at: http://arxiv.org/abs/1706.03762 (Accessed: 9 March 2022).
    """

    def __init__(self,
                 # input_size: int,
                 dec_seq_len: int,
                 max_seq_len: int,
                 out_seq_len: int,
                 dim_val: int,
                 n_encoder_layers: int = 4,
                 n_decoder_layers: int = 4,
                 n_heads: int = 4,
                 dropout_encoder: float = 0.2,
                 dropout_decoder: float = 0.2,
                 dropout_pos_enc: float = 0.2,
                 dim_feedforward_encoder: int = 2048,
                 dim_feedforward_decoder: int = 2048,
                 ):
        """
        Args:
            input_size: int, number of input variables. 1 if univariate.
            dec_seq_len: int, the length of the input sequence fed to the decoder
            max_seq_len: int, length of the longest sequence the model will
                         receive. Used in positional encoding.
            out_seq_len: int, the length of the model's output (i.e. the target
                         sequence length)
            dim_val: int, aka d_model. All sub-layers in the model produce
                     outputs of dimension dim_val
            n_encoder_layers: int, number of stacked encoder layers in the encoder
            n_decoder_layers: int, number of stacked encoder layers in the decoder
            n_heads: int, the number of attention heads (aka parallel attention layers)
            dropout_encoder: float, the dropout rate of the encoder
            dropout_decoder: float, the dropout rate of the decoder
            dropout_pos_enc: float, the dropout rate of the positional encoder
            dim_feedforward_encoder: int, number of neurons in the linear layer
                                     of the encoder
            dim_feedforward_decoder: int, number of neurons in the linear layer
                                     of the decoder
        """

        super().__init__()

        self.dec_seq_len = dec_seq_len

        # The time ranges from 0 to 23 indicating the hour of the day
        self.time_embedding = nn.Embedding(num_embeddings=NUM_TIME, embedding_dim=TIME_EMBEDDING)
        # the road segments ranges from 0 to ... specifying the road segments
        self.segment_embedding = nn.Embedding(num_embeddings=NUM_SEGMENTS, embedding_dim=SEGMENT_EMBEDDING)
        self.day_embedding = nn.Embedding(num_embeddings=NUM_DAY, embedding_dim=DAY_EMBEDDING)

        # 1. create 'linear input layer' for 'encoder'
        self.encoder_input_layer = nn.Linear(in_features=NUM_CONTINUOUS, out_features=dim_continuous)

        # 2. create positional encoder
        self.positional_encoding_layer = PositionalEncoder(d_model=dim_val,
                                                           dropout=dropout_pos_enc,
                                                           max_seq_len=max_seq_len)

        # 3. create encoder layers using nn.TransformerDecoder
        # The encoder layer used in the paper is identical to the one used by
        # Vaswani et al (2017) on which the PyTorch module is based.
        encoder_layer = nn.TransformerEncoderLayer(d_model=dim_val,
                                                   nhead=n_heads,
                                                   dim_feedforward=dim_feedforward_encoder,
                                                   dropout=dropout_encoder,
                                                   batch_first=True
                                                   )

        # It seems the option of passing a normalization instance is redundant
        # in my case, because nn.TransformerEncoderLayer per default normalizes
        # after each sub-layer (https://github.com/pytorch/pytorch/issues/24930).
        self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=n_encoder_layers, norm=None)


        # 4. create 'linear input layer' for decoder
        self.decoder_input_layer = nn.Linear(in_features=1, out_features=dim_val)

        # 5. create decoder layers using nn.TransformerDecoder
        decoder_layer = nn.TransformerDecoderLayer(d_model=dim_val,
                                                   nhead=n_heads,
                                                   dim_feedforward=dim_feedforward_decoder,
                                                   dropout=dropout_decoder,
                                                   batch_first=True
                                                   )
        self.decoder = nn.TransformerDecoder(decoder_layer=decoder_layer, num_layers=n_decoder_layers, norm=None)

        # 6. create 'linear mapping layer'
        self.linear_mapping = nn.Linear(in_features=dim_val, out_features=out_seq_len)

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor = None, tgt_mask: Tensor = None) -> Tensor:
        """
        Args:
            src: the encoder's output sequence. Shape: (S,E) for unbatched input,
                 (S, N, E) if batch_first=False or (N, S, E) if
                 batch_first=True, where S is the source sequence length,
                 N is the batch size, and E is the feature number
            tgt: the sequence to the decoder. Shape: (T,E) for unbatched input,
                 (T, N, E)(T,N,E) if batch_first=False or (N, T, E) if
                 batch_first=True, where T is the target sequence length,
                 N is the batch size, E is the feature number.
            src_mask: the mask for the src sequence to prevent the model from
                      using data points from the target sequence
            tgt_mask: the mask for the tgt sequence to prevent the model from
                      using data points from the target sequence
        """
        time_embedding = self.time_embedding(src[:,:,0].int())
        day_embedding = self.day_embedding(src[:, :, 1].int())
        segment_embedding = self.segment_embedding(src[:, :, 2].int())

        src = self.encoder_input_layer(src[:,:,3:].float())
        # concatenate them
        src = torch.cat([time_embedding, day_embedding, segment_embedding, src], dim=2)
        # add positional
        src = self.positional_encoding_layer(src)

        # Pass through all the stacked encoder layers in the encoder
        # Masking is only needed in the encoder if input sequences are padded
        # which they are not in this time series use case, because all my
        # input sequences are naturally of the same length.
        # (https://github.com/huggingface/transformers/issues/4083)
        src = self.encoder(src=src)
        decoder_output = self.decoder_input_layer(tgt.float())
        tgt_mask = None # we are trying to keep it as simple as possible.
        decoder_output = self.decoder(tgt=decoder_output.float(), memory=src, tgt_mask=tgt_mask, memory_mask=src_mask)
        # print (decoder_output.shape)

        decoder_output = self.linear_mapping(decoder_output)
        # print(decoder_output.shape)

        return decoder_output

Overwriting time_series_transformer.py


In [35]:
%%file train.py
import torch
from torch import nn
from torch.utils.data import DataLoader
from itertools import chain
from config import *
import numpy as np
from time_series_transformer import TimeSeriesTransformer
from utils import generate_square_subsequent_mask
from tqdm import tqdm

def train(data):
    # src_trg_trgy = []
    # for i in range(((data.shape[0] - enc_seq_len) // batch_size) * batch_size):
    #     src = data[i:i + enc_seq_len]
    #     trg = data[i + enc_seq_len]
    #     trg_y = data[i + enc_seq_len - 1 + dec_seq_len: i + enc_seq_len - 1 + dec_seq_len + target_seq_len]
    #
    #     src = torch.FloatTensor(np.asarray(src))
    #     trg = torch.FloatTensor(np.asarray(trg))
    #     trg_y = torch.FloatTensor(np.asarray(trg_y))
    #
    #     src_trg_trgy.append((src, trg, trg_y))

    # src_trg_trgy = np.array(src_trg_trgy)

    # from sklearn.model_selection import ShuffleSplit
    #
    # sss = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # train_index, test_index = next(sss.split(src_trg_trgy))
    #
    # train_set = src_trg_trgy[train_index]
    # test_set = src_trg_trgy[test_index]
    #
    # train_len = train_set.shape[0]
    # test_len = test_set.shape[0]

    # train = MyDataset([(src.to(device), trg.to(device), trg_y.to(device)) for src, trg, trg_y in train_set.tolist()])
    # test = MyDataset([(src.to(device), trg.to(device), trg_y.to(device)) for src, trg, trg_y in test_set.tolist()])

    train_set = DataLoader(dataset=data, batch_size=batch_size, shuffle=False, num_workers=0)
    # test_set = DataLoader(dataset=test, batch_size=batch_size, shuffle=False, num_workers=0)

    # Make tgt mask for decoder with size:
    # [batch_size*n_heads, target_seq_len, target_seq_len]
    src_mask = None
    tgt_mask = generate_square_subsequent_mask(dim1=n_heads * batch_size, dim2=dec_seq_len, dim3=dec_seq_len).to(DEVICE)

    # -----------------------------------build model---------------------------------------------------
    model = TimeSeriesTransformer(
        dim_val=dim_val,
        # input_size=input_size,
        dec_seq_len=dec_seq_len,
        max_seq_len=max_seq_len,
        out_seq_len=target_seq_len,
        n_decoder_layers=n_decoder_layers,
        n_encoder_layers=n_encoder_layers,
        n_heads=n_heads
    )
    model.to(DEVICE)
    loss_function = nn.MSELoss().to(DEVICE)

    # define optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    path = "/content/drive/MyDrive/checkpoint.pt"
    path_opt = "/content/drive/MyDrive/checkpoint_opt.pt"
    # -----------------------------------training---------------------------------------------------
    loss = 0
    best_val_loss = float("inf")
    best_model = None
    best_optimizer = None
    for i in range(epochs):
        for (src, trg, trg_y) in tqdm(train_set):
            if src.shape[0] % batch_size != 0: continue  # skip last entry
            # The shape of src is (batchsize, seq_length, ndim)
            # src = torch.reshape(src, (batch_size, enc_seq_len, 1))
            # trg = torch.reshape(trg, (batch_size, dec_seq_len, 1))
            src = src.to(DEVICE)
            trg = trg.to(DEVICE)
            trg_y = trg_y.to(DEVICE)
            y_pred = model(src=src, tgt=trg, src_mask=src_mask, tgt_mask=tgt_mask)
            loss = loss_function(y_pred[:, -1, :].reshape(batch_size, 1), trg_y.float()).float()
            
            if torch.isnan(loss):
              print("Loss function returned NaN, exiting")
              print()
              print (trg_y, y_pred)
              print(loss.item())
              return
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        torch.save({
            'epoch': i,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, path)
        if loss < best_val_loss:
            best_val_loss = loss
            best_model = model
            best_optimizer = optimizer
            torch.save({
            'epoch': i,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, path_opt)
        print('epoch:', i, '\tMSE:', loss.item())
    print("The best loss achieved was:", best_val_loss)
    return None, None, best_model
    model = best_model
    model.eval()

    pred = []
    y = []
    for (src, trg, trg_y) in test_set:
        if src.shape[0] % batch_size != 0: continue  # skip last entry
        trg_y = list(chain.from_iterable(trg_y.data.tolist()))
        y.extend(trg_y)
        src = torch.reshape(src, (batch_size, enc_seq_len, 1))
        trg = torch.reshape(trg, (batch_size, dec_seq_len, 1))
        # src = torch.reshape(src, (dec_seq_len, 1))
        src = src.to(device)
        with torch.no_grad():
            y_pred = model(src=src, tgt=trg, src_mask=src_mask, tgt_mask=tgt_mask)
            y_pred = y_pred[:, -1, :].reshape(batch_size, 1).tolist()
            pred.extend(y_pred)
    # test forecast results
    test_results = np.array(pred)
    test_real = np.array(y)
    test_MAPE = get_mape(test_real, test_results)
    test_RMSE = get_rmse(test_real, test_results)
    return test_MAPE, test_RMSE, model

Overwriting train.py


In [36]:
import torch
import numpy as np
from importlib import reload
from sklearn.model_selection import train_test_split

# load data
import pandas as pd
import train
from dataset import MyDataset
df=pd.read_pickle(drive_prefix+'overall_dataframe2.pkl')
print(df.shape)
df=df.drop(index=df[df.day_id<700].index) # only work on data from 2022 (ish)
print(df.shape, "After dropping data from 2020 and 2021")
df.sort_values(['day_id', 'segment_id_int', 'time_window'], inplace=True)

(11555002, 13)
(2617752, 13) After dropping data from 2020 and 2021


In [37]:
top_50_segments = df.groupby(['day_id', 'segment_id_int'])['time_window'].count()


In [38]:
top_50_segments = top_50_segments.groupby('segment_id_int').sum().sort_values(ascending=False).head(50)
top_50_segments.name="time_window_count"

In [39]:
df.shape

(2617752, 13)

In [40]:
df = df.merge(top_50_segments, left_on="segment_id_int", right_on="segment_id_int").drop(columns=["time_window_count"])

In [41]:
df.segment_id_int = df.segment_id_int.factorize()[0]

In [42]:
df.day_id=df.day_id.factorize()[0]

In [43]:
df.dayofweek=df.dayofweek.factorize()[0]

In [44]:
df.dayofweek.factorize()

(array([0, 0, 0, ..., 1, 1, 1]),
 Int64Index([0, 1, 2, 3, 4, 5, 6], dtype='int64'))

In [45]:
df = df.dropna()  # no idea where 1 na crept up from

In [46]:
counts = df.groupby(['day_id', 'segment_id_int'])['time_window'].count()

buckets = {}
i = 0
j = 0
for c in counts:
  for _ in range(c-5+1):
    buckets[j]=range(i,i+5)
    i+=1
    j+=1
  i+=(5-1)

In [47]:
import os
os.environ['CUDA_LAUNCH_BLOCKING']='1'
reload(train)

dataset = MyDataset(df, buckets)
train_df, test_df = torch.utils.data.random_split(dataset, (0.8, 0.2))

# train the model. All other configurations are passed in config.py
model = train.train(train_df)

100%|██████████| 389/389 [08:20<00:00,  1.29s/it]


epoch: 0 	MSE: 0.7290946245193481


100%|██████████| 389/389 [08:22<00:00,  1.29s/it]


epoch: 1 	MSE: 0.7284085750579834


100%|██████████| 389/389 [08:16<00:00,  1.28s/it]


epoch: 2 	MSE: 0.7361782789230347


100%|██████████| 389/389 [08:08<00:00,  1.26s/it]


epoch: 3 	MSE: 0.7321938276290894


100%|██████████| 389/389 [08:03<00:00,  1.24s/it]


epoch: 4 	MSE: 0.7329614162445068


100%|██████████| 389/389 [08:06<00:00,  1.25s/it]


epoch: 5 	MSE: 0.7294313907623291


100%|██████████| 389/389 [08:12<00:00,  1.27s/it]


epoch: 6 	MSE: 0.7307695150375366


100%|██████████| 389/389 [08:17<00:00,  1.28s/it]


epoch: 7 	MSE: 0.7311733365058899


 15%|█▌        | 60/389 [01:20<07:22,  1.35s/it]


KeyboardInterrupt: ignored