<a href="https://colab.research.google.com/github/PraveshKoirala/Transformers-Project/blob/main/transformer_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch
  Downloading torch-1.13.0-cp38-cp38-manylinux1_x86_64.whl (890.2 MB)
[K     |██████████████████████████████  | 834.1 MB 103.2 MB/s eta 0:00:01tcmalloc: large alloc 1147494400 bytes == 0x65ab2000 @  0x7f8014070615 0x5d631c 0x51e4f1 0x51e67b 0x4f7585 0x49ca7c 0x4fdff5 0x49caa1 0x4fdff5 0x49ced5 0x4f60a9 0x55f926 0x4f60a9 0x55f926 0x4f60a9 0x55f926 0x5d7c18 0x5d9412 0x586636 0x5d813c 0x55f3fd 0x55e571 0x5d7cf1 0x49ced5 0x55e571 0x5d7cf1 0x49ec69 0x5d7c18 0x49ca7c 0x4fdff5 0x49ced5
[K     |████████████████████████████████| 890.2 MB 5.6 kB/s 
Collecting nvidia-cuda-runtime-cu11==11.7.99
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[K     |████████████████████████████████| 849 kB 52.9 MB/s 
[?25hCollecting nvidia-cublas-cu11==11.10.3.66
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
[K     |█

# Configuration parameters

In [2]:
%%file config.py
# Max number of tokens
dmodel = 512
dim_val = dmodel  # This can be any value divisible by n_heads. 512 is used in the original transformer paper.
target_seq_len = 1  # Length of the target sequence, i.e. how many time steps should your forecast cover
n_encoder_layers = 4  # Number of times the encoder layer is stacked in the encoder
n_decoder_layers = 4  # Number of times the decoder layer is stacked in the decoder
n_heads = 8  # The number of attention heads (aka parallel attention layers). dim_val must be divisible by this number
batch_size = 512

enc_seq_len = 5  # length of input given to encoder. Can have any integer value.
dec_seq_len = enc_seq_len-1  # length of input given to decoder. Can have any integer value.
max_seq_len = enc_seq_len  # What's the longest sequence the model will encounter? Used to make the positional encoder
epochs = 20
lr = 0.001
weight_decay = 0.0001
ratio = 0.8
DEVICE = "cuda"
TIME_EMBEDDING = 32
SEGMENT_EMBEDDING = 128
DAY_EMBEDDING = 7
NUM_DAY = 7
NUM_SEGMENTS = 50
NUM_TIME=96
NUM_CONTINUOUS=8
dim_continuous = dmodel - TIME_EMBEDDING - SEGMENT_EMBEDDING - DAY_EMBEDDING

Writing config.py


# Datasets and Dataloader stuff

In [3]:
from google.colab import drive
drive.mount('/content/drive')
drive_prefix="/content/drive/MyDrive/Transformers/"

Mounted at /content/drive


In [4]:
%%capture
!pip install --upgrade pandas

In [5]:
%%file dataset.py
import torch
from torch.utils.data import Dataset
import pandas as pd
import config
import numpy as np

class MyDataset(Dataset):
    def __init__(self, data, buckets):
        self.buckets=buckets
        self.data = data

    def __getitem__(self, item):
        cols = self.data.iloc[self.buckets[item]]
        return  torch.tensor(cols[['time_window', 'dayofweek', 'segment_id_int',
                                   'is_holiday', 'is_school_break', 
                                   'travel_time', 'darksky_temperature',
                                   'darksky_humidity', 
                                   'darksky_precipitation_probability',
                                   'traffic_speed', 'distance_m']].astype(np.float).values), \
                torch.tensor(cols[['delay_time']].iloc[:config.max_seq_len-1].astype(np.float).values), \
                torch.tensor(cols[['delay_time']].iloc[config.max_seq_len-1].astype(np.float).values)

    def __len__(self):
        return len(self.buckets)

Writing dataset.py


In [6]:
%%file metrics.py
import numpy as np

def get_mape(x, y):
    return np.mean(np.abs((x-y)/x)) * 100


def get_rmse(x, y):
    return np.sqrt(np.mean(np.square(x - y)))

Writing metrics.py


In [7]:
%%file penc.py
import torch
import math
from torch import nn, Tensor


class PositionalEncoder(nn.Module):
    """
    Adapted from:
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    https://github.com/LiamMaclean216/Pytorch-Transfomer/blob/master/utils.py
    """

    def __init__(self, dropout: float = 0.1, max_seq_len: int = 5000, d_model: int = 512):
        """
        Args:
            dropout: the dropout rate
            max_seq_len: the maximum length of the input sequences
            d_model: The dimension of the output of sub-layers in the model
                     (Vaswani et al, 2017)
        """

        super().__init__()

        self.d_model = d_model

        self.dropout = nn.Dropout(p=dropout)

        # Create constant positional encoding matrix with values
        # dependent on position and i
        position = torch.arange(max_seq_len).unsqueeze(1)

        exp_input = torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)

        div_term = torch.exp(exp_input)  # Returns a new tensor with the exponential of the elements of exp_input

        pe = torch.zeros(max_seq_len, d_model)

        pe[:, 0::2] = torch.sin(position * div_term)

        pe[:, 1::2] = torch.cos(position * div_term)  # torch.Size([target_seq_len, dim_val])

        pe = pe.unsqueeze(0).transpose(0, 1)  # torch.Size([target_seq_len, input_size, dim_val])

        # register that pe is not a model parameter
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, enc_seq_len, dim_val]
        """

        add = self.pe[:x.size(1), :].squeeze(1)

        x = x + add

        return self.dropout(x)

Writing penc.py


In [9]:
%%file time_series_transformer.py
import torch
from torch import nn, Tensor

from penc import PositionalEncoder
from config import *

class TimeSeriesTransformer(nn.Module):
    """
    This class implements a transformer model that can be used for times series
    forecasting. This time series transformer model is based on the paper by
    Wu et al (2020) [1]. The paper will be referred to as "the paper".
    A detailed description of the code can be found in my article here:
    https://towardsdatascience.com/how-to-make-a-pytorch-transformer-for-time-series-forecasting-69e073d4061e
    In cases where the paper does not specify what value was used for a specific
    configuration/hyperparameter, this class uses the values from Vaswani et al
    (2017) [2] or from PyTorch source code.
    Unlike the paper, this class assumes that input layers, positional encoding
    layers and linear mapping layers are separate from the encoder and decoder,
    i.e. the encoder and decoder only do what is depicted as their sub-layers
    in the paper. For practical purposes, this assumption does not make a
    difference - it merely means that the linear and positional encoding layers
    are implemented inside the present class and not inside the
    Encoder() and Decoder() classes.
    [1] Wu, N., Green, B., Ben, X., O'banion, S. (2020).
    'Deep Transformer Models for Time Series Forecasting:
    The Influenza Prevalence Case'.
    arXiv:2001.08317 [cs, stat] [Preprint].
    Available at: http://arxiv.org/abs/2001.08317 (Accessed: 9 March 2022).
    [2] Vaswani, A. et al. (2017)
    'Attention Is All You Need'.
    arXiv:1706.03762 [cs] [Preprint].
    Available at: http://arxiv.org/abs/1706.03762 (Accessed: 9 March 2022).
    """

    def __init__(self,
                 # input_size: int,
                 dec_seq_len: int,
                 max_seq_len: int,
                 out_seq_len: int,
                 dim_val: int,
                 n_encoder_layers: int = 4,
                 n_decoder_layers: int = 4,
                 n_heads: int = 4,
                 dropout_encoder: float = 0.2,
                 dropout_decoder: float = 0.2,
                 dropout_pos_enc: float = 0.2,
                 dim_feedforward_encoder: int = 2048,
                 dim_feedforward_decoder: int = 2048,
                 ):
        """
        Args:
            input_size: int, number of input variables. 1 if univariate.
            dec_seq_len: int, the length of the input sequence fed to the decoder
            max_seq_len: int, length of the longest sequence the model will
                         receive. Used in positional encoding.
            out_seq_len: int, the length of the model's output (i.e. the target
                         sequence length)
            dim_val: int, aka d_model. All sub-layers in the model produce
                     outputs of dimension dim_val
            n_encoder_layers: int, number of stacked encoder layers in the encoder
            n_decoder_layers: int, number of stacked encoder layers in the decoder
            n_heads: int, the number of attention heads (aka parallel attention layers)
            dropout_encoder: float, the dropout rate of the encoder
            dropout_decoder: float, the dropout rate of the decoder
            dropout_pos_enc: float, the dropout rate of the positional encoder
            dim_feedforward_encoder: int, number of neurons in the linear layer
                                     of the encoder
            dim_feedforward_decoder: int, number of neurons in the linear layer
                                     of the decoder
        """

        super().__init__()

        self.dec_seq_len = dec_seq_len

        # The time ranges from 0 to 23 indicating the hour of the day
        self.time_embedding = nn.Embedding(num_embeddings=NUM_TIME, embedding_dim=TIME_EMBEDDING)
        # the road segments ranges from 0 to ... specifying the road segments
        self.segment_embedding = nn.Embedding(num_embeddings=NUM_SEGMENTS, embedding_dim=SEGMENT_EMBEDDING)
        self.day_embedding = nn.Embedding(num_embeddings=NUM_DAY, embedding_dim=DAY_EMBEDDING)

        # 1. create 'linear input layer' for 'encoder'
        self.encoder_input_layer = nn.Linear(in_features=NUM_CONTINUOUS, out_features=dim_continuous)

        # 2. create positional encoder
        self.positional_encoding_layer = PositionalEncoder(d_model=dim_val,
                                                           dropout=dropout_pos_enc,
                                                           max_seq_len=max_seq_len)

        # 3. create encoder layers using nn.TransformerDecoder
        # The encoder layer used in the paper is identical to the one used by
        # Vaswani et al (2017) on which the PyTorch module is based.
        encoder_layer = nn.TransformerEncoderLayer(d_model=dim_val,
                                                   nhead=n_heads,
                                                   dim_feedforward=dim_feedforward_encoder,
                                                   dropout=dropout_encoder,
                                                   batch_first=True
                                                   )

        # It seems the option of passing a normalization instance is redundant
        # in my case, because nn.TransformerEncoderLayer per default normalizes
        # after each sub-layer (https://github.com/pytorch/pytorch/issues/24930).
        self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=n_encoder_layers, norm=None)


        # 4. create 'linear input layer' for decoder
        self.decoder_input_layer = nn.Linear(in_features=1, out_features=dim_val)

        # 5. create decoder layers using nn.TransformerDecoder
        decoder_layer = nn.TransformerDecoderLayer(d_model=dim_val,
                                                   nhead=n_heads,
                                                   dim_feedforward=dim_feedforward_decoder,
                                                   dropout=dropout_decoder,
                                                   batch_first=True
                                                   )
        self.decoder = nn.TransformerDecoder(decoder_layer=decoder_layer, num_layers=n_decoder_layers, norm=None)

        # 6. create 'linear mapping layer'
        self.linear_mapping = nn.Linear(in_features=dim_val, out_features=out_seq_len)

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor = None, tgt_mask: Tensor = None) -> Tensor:
        """
        Args:
            src: the encoder's output sequence. Shape: (S,E) for unbatched input,
                 (S, N, E) if batch_first=False or (N, S, E) if
                 batch_first=True, where S is the source sequence length,
                 N is the batch size, and E is the feature number
            tgt: the sequence to the decoder. Shape: (T,E) for unbatched input,
                 (T, N, E)(T,N,E) if batch_first=False or (N, T, E) if
                 batch_first=True, where T is the target sequence length,
                 N is the batch size, E is the feature number.
            src_mask: the mask for the src sequence to prevent the model from
                      using data points from the target sequence
            tgt_mask: the mask for the tgt sequence to prevent the model from
                      using data points from the target sequence
        """
        time_embedding = self.time_embedding(src[:,:,0].int())
        day_embedding = self.day_embedding(src[:, :, 1].int())
        segment_embedding = self.segment_embedding(src[:, :, 2].int())

        src = self.encoder_input_layer(src[:,:,3:].float())
        # concatenate them
        src = torch.cat([time_embedding, day_embedding, segment_embedding, src], dim=2)
        # add positional
        src = self.positional_encoding_layer(src)

        # Pass through all the stacked encoder layers in the encoder
        # Masking is only needed in the encoder if input sequences are padded
        # which they are not in this time series use case, because all my
        # input sequences are naturally of the same length.
        # (https://github.com/huggingface/transformers/issues/4083)
        src = self.encoder(src=src)
        decoder_output = self.decoder_input_layer(tgt.float())
        tgt_mask = None # we are trying to keep it as simple as possible.
        decoder_output = self.decoder(tgt=decoder_output.float(), memory=src, tgt_mask=tgt_mask, memory_mask=src_mask)
        # print (decoder_output.shape)

        decoder_output = self.linear_mapping(decoder_output)
        # print(decoder_output.shape)

        return decoder_output

Writing time_series_transformer.py


In [11]:
import torch
import numpy as np
from importlib import reload
from sklearn.model_selection import train_test_split

# load data
import pandas as pd
from dataset import MyDataset
df=pd.read_pickle(drive_prefix+'overall_dataframe2.pkl')
df=df.drop(index=df[df.day_id<700].index) # only work on data from 2022 (ish)
df.sort_values(['day_id', 'segment_id_int', 'time_window'], inplace=True)
top_50_segments = df.groupby(['day_id', 'segment_id_int'])['time_window'].count()
top_50_segments = top_50_segments.groupby('segment_id_int').sum().sort_values(ascending=False).head(50)
top_50_segments.name="time_window_count"
df = df.merge(top_50_segments, left_on="segment_id_int", right_on="segment_id_int").drop(columns=["time_window_count"])

df.segment_id_int = df.segment_id_int.factorize()[0]
df.day_id=df.day_id.factorize()[0]
df.dayofweek=df.dayofweek.factorize()[0]
df.dayofweek.factorize()
df = df.dropna()  # no idea where 1 na crept up from
counts = df.groupby(['day_id', 'segment_id_int'])['time_window'].count()

buckets = {}
i = 0
j = 0
for c in counts:
  for _ in range(c-5+1):
    buckets[j]=range(i,i+5)
    i+=1
    j+=1
  i+=(5-1)

In [21]:
import os
from time_series_transformer import TimeSeriesTransformer
os.environ['CUDA_LAUNCH_BLOCKING']='1'

dataset = MyDataset(df, buckets)
train_df, test_df = torch.utils.data.random_split(dataset, (0.8, 0.2))

from config import dim_val, dec_seq_len, max_seq_len, target_seq_len, n_decoder_layers, n_encoder_layers, n_heads
tst = TimeSeriesTransformer(
    dim_val=dim_val,
    # input_size=input_size,
    dec_seq_len=dec_seq_len,
    max_seq_len=max_seq_len,
    out_seq_len=target_seq_len,
    n_decoder_layers=n_decoder_layers,
    n_encoder_layers=n_encoder_layers,
    n_heads=n_heads
)
state = torch.load("/content/drive/MyDrive/checkpoint_opt.pt")
tst.load_state_dict(state["model_state_dict"])
tst.eval()


TimeSeriesTransformer(
  (time_embedding): Embedding(96, 32)
  (segment_embedding): Embedding(50, 128)
  (day_embedding): Embedding(7, 7)
  (encoder_input_layer): Linear(in_features=8, out_features=345, bias=True)
  (positional_encoding_layer): PositionalEncoder(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
    

In [34]:
from torch.utils.data import DataLoader
from config import *
DELAY_FACTOR = 211.37619472807717

test_set = DataLoader(dataset=test_df, batch_size=1, shuffle=False, num_workers=0)
for test in test_set:
  (src, trg, trg_y) = test
  # perform inference
  delays = tst(src=src, tgt=trg, src_mask=None, tgt_mask=None)[0, -1, 0].item()
  print("Delay predicted for this segment is ", delays * DELAY_FACTOR, "seconds")
  break


Delay predicted for this segment is  39.80212773078741 seconds
