<a href="https://colab.research.google.com/github/Ogunfool/A-Transformer-Based-Model-For-Multivariate-Time-Series-Prediction-Task-Built-from-scratch-in-pytorch/blob/main/Transformers_in_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset

# Encoders Transformer Based Network

Building Transformers from scratch. The modules include:

*   Multi-head Attention
*   Transformer Block(s)

*   Positional Encoding
*   Encoder / Decoder





Pytorch Modules - Example.

---


Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes:

In [None]:
class Model(nn.Module):
    """Example module: two 5x5 convolutions, each followed by a ReLU."""

    def __init__(self):
        super().__init__()
        # Submodules assigned as attributes are registered automatically.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=20, kernel_size=5)

    def forward(self, x):
        """Return relu(conv2(relu(conv1(x))))."""
        hidden = F.relu(self.conv1(x))
        out = self.conv2(hidden)
        return F.relu(out)

Implement Multi-head Attention from Scratch.

In [None]:
# Excerpt from the Lazy Programmer NLP course....
# Subclass the torch.nn class
class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product self-attention.

  All heads are computed in one pass: the key/query/value projections each
  map d_model features to no_heads * d_k features, which are then split
  into heads. Values use the same per-head size as keys (d_v == d_k).

  Args:
    d_k: per-head key/query/value dimension.
    d_model: feature size of the input and of the returned tensor.
    no_heads: number of attention heads.
  """

  def __init__(self, d_k, d_model, no_heads):
    super().__init__()
    self.d_k = d_k
    self.n_heads = no_heads

    inner_dim = d_k * no_heads
    # Input x is (N, T, d_model); every projection yields (N, T, h*d_k).
    self.key = nn.Linear(d_model, inner_dim)
    self.query = nn.Linear(d_model, inner_dim)
    self.value = nn.Linear(d_model, inner_dim)

    # Output projection: (N, T, h*d_k) -> (N, T, d_model).
    self.fc = nn.Linear(inner_dim, d_model)

  def forward(self, x, mask=None):
    """Self-attend over x.

    Args:
      x: tensor of shape (N, T, d_model).
      mask: optional tensor indexed as mask[:, None, None, :], i.e. (N, T);
        key positions where it equals 0 are excluded from attention.

    Returns:
      Tensor of shape (N, T, d_model).
    """
    batch, seq_len = x.shape[0], x.shape[1]
    heads_shape = (batch, seq_len, self.n_heads, self.d_k)

    # Project, then (N, T, h*d_k) -> (N, T, h, d_k) -> (N, h, T, d_k)
    # so the matrix products below run independently per head.
    queries = self.query(x).view(heads_shape).transpose(1, 2)
    keys = self.key(x).view(heads_shape).transpose(1, 2)
    values = self.value(x).view(heads_shape).transpose(1, 2)

    # (N, h, T, d_k) @ (N, h, d_k, T) -> (N, h, T, T)
    scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.d_k)
    if mask is not None:
      blocked = mask[:, None, None, :] == 0
      scores = scores.masked_fill(blocked, float('-inf'))
    weights = F.softmax(scores, dim=-1)

    # (N, h, T, T) @ (N, h, T, d_k) -> (N, h, T, d_k)
    context = torch.matmul(weights, values)

    # Merge the heads back: (N, h, T, d_k) -> (N, T, h*d_k)
    merged = context.transpose(1, 2).contiguous()
    merged = merged.view(batch, seq_len, self.n_heads * self.d_k)

    return self.fc(merged)

In [None]:
class TransformerBlock(nn.Module):
  """One encoder block: self-attention then a feed-forward network.

  Each sub-layer is wrapped in a residual connection followed by LayerNorm
  (post-norm), and dropout is applied to the block's final output.

  Args:
    d_k: per-head key/query/value dimension passed to MultiHeadAttention.
    d_model: feature size of the input and output.
    no_heads: number of attention heads.
    dropout_prob: dropout probability used inside the feed-forward network
      and on the block output.
  """

  def __init__(self, d_k, d_model, no_heads, dropout_prob=0.1):
    super().__init__()
    hidden_dim = 4 * d_model

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = MultiHeadAttention(d_k, d_model, no_heads)
    # Position-wise feed-forward network: expand 4x, GELU, project back.
    feed_forward = [
        nn.Linear(d_model, hidden_dim),
        nn.GELU(),
        nn.Linear(hidden_dim, d_model),
        nn.Dropout(dropout_prob),
    ]
    self.ann = nn.Sequential(*feed_forward)
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, x, mask=None):
    """Run the block on x; mask is forwarded unchanged to the attention layer."""
    attended = self.mha(x, mask)
    x = self.ln1(x + attended)
    transformed = self.ann(x)
    x = self.ln2(x + transformed)
    return self.dropout(x)

In [None]:
class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal position encodings to the input, then apply dropout.

  Even feature columns hold sin(pos * w_i) and odd columns hold
  cos(pos * w_i), with w_i = exp(-2i * ln(10000) / d_model). The table is
  stored as the non-trainable buffer `pe` of shape (1, max_len, d_model).

  Args:
    d_model: feature size of the input; may be even or odd.
    max_len: number of positions to precompute.
    dropout_prob: dropout probability applied after adding the encoding.
  """

  def __init__(self, d_model, max_len=2048, dropout_prob=0.1):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout_prob)

    position = torch.arange(max_len).unsqueeze(1)   # (max_len, 1)
    exp_term = torch.arange(0, d_model, 2)          # even feature indices
    div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))
    angles = position * div_term                    # (max_len, ceil(d_model / 2))

    pe = torch.zeros(1, max_len, d_model)
    pe[0, :, 0::2] = torch.sin(angles)
    # When d_model is odd there is one fewer odd column than even columns,
    # so drop the last frequency; assigning all of them would raise a
    # shape-mismatch error.
    pe[0, :, 1::2] = torch.cos(angles[:, :d_model // 2])
    # Buffer: follows the module across devices and is saved in state_dict,
    # but is not a trainable parameter.
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Return dropout(x + pe[:, :T]) for x of shape N x T x D."""
    x = x + self.pe[:, :x.size(1), :]
    return self.dropout(x)

In [None]:
class Encoder(nn.Module):
  """Transformer encoder for sequence classification (many-to-one).

  Token ids are embedded, position-encoded, passed through n_layers
  TransformerBlocks, and the representation at the first time step is
  layer-normalized and projected to n_classes outputs.

  Args:
    vocab_size: number of rows in the embedding table.
    max_len: number of positions precomputed by the positional encoding.
    d_k: per-head attention dimension.
    d_model: embedding / hidden feature size.
    no_heads: attention heads per block.
    n_layers: number of stacked TransformerBlocks.
    n_classes: size of the output layer.
    dropout_prob: dropout probability used throughout.
  """

  def __init__(self,
               vocab_size,
               max_len,
               d_k,
               d_model,
               no_heads,
               n_layers,
               n_classes,
               dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    self.transformer_blocks = nn.Sequential(*(
        TransformerBlock(d_k, d_model, no_heads, dropout_prob)
        for _ in range(n_layers)
    ))
    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, n_classes)

  def forward(self, x, mask=None):
    """Map token ids x (N x T) to outputs of shape N x n_classes."""
    hidden = self.pos_encoding(self.embedding(x))

    # The Sequential is only used as a container: each block also needs
    # the mask, so the blocks are called one by one.
    for layer in self.transformer_blocks:
      hidden = layer(hidden, mask)

    # many-to-one: keep only the first time step, (N, T, D) -> (N, D)
    first_step = hidden[:, 0, :]

    return self.fc(self.ln(first_step))

In [None]:
# Positional arguments follow Encoder.__init__: vocab_size=20_000, max_len=1024,
# d_k=16, d_model=64, no_heads=4, n_layers=2, n_classes=5, dropout_prob=0.1
model = Encoder(20_000, 1024, 16, 64, 4, 2, 5, 0.1)

In [None]:
# Select the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Move the model's parameters and buffers to the selected device.
# As the last expression of the cell, the returned module is also displayed.
model.to(device)

cuda:0


Encoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, 

In [None]:
# Generate random dataset for code testing
# T = sequence length
# Token ids are drawn from [0, 20_000), the vocab_size the test model was built with
x = np.random.randint(0, 20_000, size=(8, 512)) # N = 8, T = 512
x_t = torch.tensor(x).to(device)

# Generate data mask
# 1 = attend, 0 = ignore: the second half of every sequence is masked out,
# since the attention layer drops key positions where the mask equals 0
mask = np.ones((8, 512))
mask[:, 256:] = 0
mask_t = torch.tensor(mask).to(device)

In [None]:
# Expected output size = N x num_classes
y = model(x_t, mask_t)
y.shape

torch.Size([8, 5])

# Sentiment Analysis With Above Model.

Get, Preprocess and Load Data - Hugging Face. 

*   Use an `AutoTokenizer` and `DataCollatorWithPadding` from the Hugging Face library to tokenize the data and generate the padding mask.
*   Load the data with a torch `DataLoader`.



In [None]:
! pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.5 MB/s[0m eta [36m0:00:00[0m
Co

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [None]:
# Load the pretrained tokenizer for the 'distilbert-base-cased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
tokenizer  # last expression of the cell: displays the tokenizer's repr

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
from datasets import load_dataset
# Download (or reuse from cache) the "sst2" configuration of the "glue" dataset
raw_datasets = load_dataset("glue", "sst2")

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
def tokenize_fn(batch):
  """Tokenize the 'sentence' field of a batch, truncating over-long inputs."""
  return tokenizer(batch['sentence'], truncation=True)
# Map the tokenize function over every split, one batch of rows at a time
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)
# Collator used by the DataLoaders below as collate_fn; it is built on the
# same tokenizer so it can pad the examples of a batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
# Drop the columns the model does not consume ...
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
# ... and rename 'label' to 'labels', the key the training loop reads
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

Load datasets.

In [None]:
from torch.utils.data import DataLoader

# Training batches: reshuffled every epoch, 32 examples each, assembled by
# the padding collator
train_loader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator
)
# Validation batches: same batch size and collator, kept in dataset order
valid_loader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=32,
    collate_fn=data_collator
)

In [None]:
# check how it works
# Print the key and tensor shape of every entry in the first training batch only
for batch in train_loader:
  for k, v in batch.items():
    print("k:", k, "v.shape:", v.shape)
  break

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


k: labels v.shape: torch.Size([32])
k: input_ids v.shape: torch.Size([32, 45])
k: attention_mask v.shape: torch.Size([32, 45])


In [None]:
batch.items

<bound method BatchEncoding.items of {'labels': tensor([0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
        1, 1, 1, 0, 0, 1, 1, 0]), 'input_ids': tensor([[  101,   179, 15818,  ...,     0,     0,     0],
        [  101,  1110,   170,  ...,     0,     0,     0],
        [  101,  1106,  3994,  ...,     0,     0,     0],
        ...,
        [  101,  1128,  1431,  ...,     0,     0,     0],
        [  101,  1122,   112,  ...,     0,     0,     0],
        [  101,  1104,  1103,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}>

Encoder Model

In [None]:
checkpoint = 'distilbert-base-cased'
model = Encoder(
    vocab_size=tokenizer.vocab_size,
    # Longest sequence the tokenizer will emit (512 for this checkpoint).
    # Read from the tokenizer instance rather than the static
    # `max_model_input_sizes` class map, which newer transformers releases
    # no longer provide.
    max_len=tokenizer.model_max_length,
    d_k=16,
    d_model=64,
    no_heads=4,
    n_layers=2,
    n_classes=2,   # SST-2 labels are binary
    dropout_prob=0.1,
)
# Move the model to the selected device; the returned module is displayed
model.to(device)

Instantiate Model Loss, Optimizer(s) and train.

In [None]:
# Cross-entropy over the model's raw class scores
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters, with the optimizer's default hyperparameters
optimizer = torch.optim.Adam(model.parameters())

In [None]:
from datetime import datetime
# Training loop
def train(model, criterion, optimizer, train_loader, valid_loader, epochs):
  """Train `model` for `epochs` epochs and evaluate it after each one.

  Batches are dicts with 'input_ids', 'attention_mask' and 'labels' tensors.
  They are moved to the device the model's parameters live on, so the
  function does not depend on a module-level `device` variable.

  Args:
    model: module called as model(input_ids, attention_mask).
    criterion: loss called as criterion(outputs, labels).
    optimizer: optimizer over the model's parameters.
    train_loader: iterable of training batches.
    valid_loader: iterable of validation batches.
    epochs: number of passes over train_loader.

  Returns:
    (train_losses, test_losses): two NumPy arrays of length `epochs` with
    the per-example average loss of every epoch.
  """
  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  # Run on whatever device the model is already on (CPU if it has no parameters)
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")

  for it in range(epochs):
    model.train()   # Model in training mode
    t0 = datetime.now()
    train_loss = 0
    n_train = 0
    for batch in train_loader:
      # move data to the model's device
      batch = {k: v.to(run_device) for k, v in batch.items()}

      # zero the parameter gradients
      optimizer.zero_grad()

      # Forward pass
      outputs = model(batch['input_ids'], batch['attention_mask'])
      loss = criterion(outputs, batch['labels'])

      # Backward and optimize
      loss.backward()   # Compute Gradients (Back prop)
      optimizer.step()  # Update weights(GD/Adam)

      # Weight the batch-mean loss by batch size so the epoch average is
      # exact even when the last batch is smaller
      batch_size = batch['input_ids'].size(0)
      train_loss += loss.item() * batch_size
      n_train += batch_size

    # Get average train loss
    train_loss = train_loss / n_train

    # Evaluate model at the end of each epoch
    model.eval()
    test_loss = 0
    n_test = 0
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time
    with torch.no_grad():
      for batch in valid_loader:
        batch = {k: v.to(run_device) for k, v in batch.items()}
        outputs = model(batch['input_ids'], batch['attention_mask'])
        loss = criterion(outputs, batch['labels'])
        batch_size = batch['input_ids'].size(0)
        test_loss += loss.item() * batch_size
        n_test += batch_size
    test_loss = test_loss / n_test

    # Save losses
    train_losses[it] = train_loss
    test_losses[it] = test_loss

    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
      Test Loss: {test_loss:.4f}, Duration: {dt}')

  return train_losses, test_losses

Now Train

In [None]:
train(model, criterion, optimizer, train_loader, valid_loader, epochs=4)

Epoch 1/4, Train Loss: 0.4973,       Test Loss: 0.5238, Duration: 0:00:20.227275
Epoch 2/4, Train Loss: 0.3547,       Test Loss: 0.5192, Duration: 0:00:17.718122
Epoch 3/4, Train Loss: 0.2958,       Test Loss: 0.5052, Duration: 0:00:18.772279
Epoch 4/4, Train Loss: 0.2567,       Test Loss: 0.5356, Duration: 0:00:18.641025


(array([0.49727873, 0.35472264, 0.29578105, 0.25674538]),
 array([0.52382502, 0.51919762, 0.50519833, 0.53563062]))

Accuracy Evaluation

In [None]:
# Accuracy
def evaluate_accuracy(model, loader):
  """Return the fraction of examples in `loader` that `model` classifies correctly.

  Batches are dicts with 'input_ids', 'attention_mask' and 'labels'; they
  are moved to the notebook-level `device` before the forward pass.
  """
  n_correct = 0.
  n_total = 0.
  # Inference only: disable autograd so no graph is built per batch
  with torch.no_grad():
    for batch in loader:
      # Move to GPU
      batch = {k: v.to(device) for k, v in batch.items()}

      # Forward pass
      outputs = model(batch['input_ids'], batch['attention_mask'])

      # Get prediction
      # torch.max returns both max and argmax
      _, predictions = torch.max(outputs, 1)

      # update counts
      n_correct += (predictions == batch['labels']).sum().item()
      n_total += batch['labels'].shape[0]
  return n_correct / n_total


model.eval()  # disable dropout for evaluation
train_acc = evaluate_accuracy(model, train_loader)
test_acc = evaluate_accuracy(model, valid_loader)
print(f"Train acc: {train_acc:.4f}, Test acc: {test_acc:.4f}")

Train acc: 0.9362, Test acc: 0.7959


# Time Series Data Prediction With Above Model.

Modify Encoder For Time series data and Regression task. Take note --


*   Mask = None, because the sequence length does not change, given the way the data was prepared.
*   No embedding layer is required and the positional encoding will be added to x inputs directly.


*   max_len = T





In [None]:
T = 5    # Sequence length; also max_len
D = 8    # Features per time step; also d_model
K = 1    # Number of outputs; also num_classes

In [None]:
# Excerpt from the Lazy Programmer NLP course....
# Subclass the torch.nn class
class MultiHeadAttention(nn.Module):
  """Vectorized multi-head scaled dot-product self-attention.

  Keys, queries and values all use the same per-head size d_k. The heads
  are produced by single linear layers of width no_heads * d_k and split
  apart inside forward().

  Args:
    d_k: per-head key/query/value dimension.
    d_model: feature size of the input and of the returned tensor.
    no_heads: number of attention heads.
  """

  def __init__(self, d_k, d_model, no_heads):
    super().__init__()
    self.d_k = d_k
    self.n_heads = no_heads

    total_width = d_k * no_heads
    # x is (N, T, d_model); each projection below returns (N, T, h*d_k)
    self.key = nn.Linear(d_model, total_width)
    self.query = nn.Linear(d_model, total_width)
    self.value = nn.Linear(d_model, total_width)

    # final linear layer: (N, T, h*d_k) -> (N, T, d_model)
    self.fc = nn.Linear(total_width, d_model)

  def forward(self, x, mask=None):
    """Return the attention output for x, shape (N, T, d_model).

    mask, when given, is indexed as mask[:, None, None, :] (so (N, T));
    key positions where it equals 0 receive zero attention weight.
    """
    n_batch, n_steps = x.shape[0], x.shape[1]
    split = (n_batch, n_steps, self.n_heads, self.d_k)

    # (N, T, h*d_k) -> (N, T, h, d_k) -> (N, h, T, d_k)
    q = self.query(x).view(split).transpose(1, 2)
    k = self.key(x).view(split).transpose(1, 2)
    v = self.value(x).view(split).transpose(1, 2)

    # scaled dot products: (N, h, T, d_k) x (N, h, d_k, T) -> (N, h, T, T)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
    if mask is not None:
      hidden_keys = mask[:, None, None, :] == 0
      scores = scores.masked_fill(hidden_keys, float('-inf'))
    weights = F.softmax(scores, dim=-1)

    # weighted values: (N, h, T, T) x (N, h, T, d_k) -> (N, h, T, d_k)
    per_head = torch.matmul(weights, v)

    # concatenate heads: (N, h, T, d_k) -> (N, T, h*d_k)
    joined = per_head.transpose(1, 2).contiguous()
    joined = joined.view(n_batch, n_steps, self.d_k * self.n_heads)

    # projection back to d_model
    return self.fc(joined)

In [None]:
# Smoke test of the attention layer alone (d_k=4, d_model=8, no_heads=3).
# Returns NxTxD_model
# NOTE(review): x_t must be the float (N, T, 8) time-series tensor built in the
# "Test with time series data" cell further down; the recorded output shape
# suggests this cell was run after it -- confirm the execution order.
model = MultiHeadAttention(4,8,3)
model.to(device)
y = model(x_t)
y.shape

torch.Size([41452, 5, 8])

In [None]:
class TransformerBlock(nn.Module):
  """Post-norm encoder block: attention and feed-forward sub-layers.

  Each sub-layer's output is added to its input and layer-normalized.
  The feed-forward network applies dropout after the activation and after
  the output projection; a further dropout is applied to the block output.

  Args:
    d_k: per-head key/query/value dimension passed to MultiHeadAttention.
    d_model: feature size of the input and output.
    no_heads: number of attention heads.
    dropout_prob: probability shared by all dropout layers in the block.
  """

  def __init__(self, d_k, d_model, no_heads, dropout_prob=0.1):
    super().__init__()
    wide = 4 * d_model

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = MultiHeadAttention(d_k, d_model, no_heads)
    # Feed-forward: expand 4x -> GELU -> dropout -> project back -> dropout
    ff_layers = [
        nn.Linear(d_model, wide),
        nn.GELU(),
        nn.Dropout(dropout_prob),
        nn.Linear(wide, d_model),
        nn.Dropout(dropout_prob),
    ]
    self.ann = nn.Sequential(*ff_layers)
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, x, mask=None):
    """Run the block on x; mask is handed unchanged to the attention layer."""
    attn_out = self.mha(x, mask)
    x = self.ln1(x + attn_out)
    ff_out = self.ann(x)
    x = self.ln2(x + ff_out)
    return self.dropout(x)

In [None]:
# Smoke test of a single block (d_k=4, d_model=8, no_heads=3, dropout_prob=0.2).
# Return NxTxD_model
# NOTE(review): relies on x_t being the float time-series tensor defined in a
# later cell -- confirm the execution order.
model = TransformerBlock(4, 8, 3, 0.2)
model.to(device)
y = model(x_t)
y.shape

torch.Size([41452, 5, 8])

In [None]:
class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal position encodings to the input, then apply dropout.

  Even feature columns hold sin(pos * w_i) and odd columns hold
  cos(pos * w_i), with w_i = exp(-2i * ln(10000) / d_model). The table is
  stored as the non-trainable buffer `pe` of shape (1, max_len, d_model).

  Args:
    d_model: feature size of the input; may be even or odd.
    max_len: number of positions to precompute.
    dropout_prob: dropout probability applied after adding the encoding.
  """

  def __init__(self, d_model, max_len, dropout_prob):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout_prob)

    position = torch.arange(max_len).unsqueeze(1)   # (max_len, 1)
    exp_term = torch.arange(0, d_model, 2)          # even feature indices
    div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))
    angles = position * div_term                    # (max_len, ceil(d_model / 2))

    pe = torch.zeros(1, max_len, d_model)
    pe[0, :, 0::2] = torch.sin(angles)
    # When d_model is odd there is one fewer odd column than even columns,
    # so drop the last frequency; assigning all of them would raise a
    # shape-mismatch error.
    pe[0, :, 1::2] = torch.cos(angles[:, :d_model // 2])
    # Buffer: follows the module across devices and is saved in state_dict,
    # but is not a trainable parameter.
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Return dropout(x + pe[:, :T]) for x of shape N x T x D."""
    x = x + self.pe[:, :x.size(1), :]
    return self.dropout(x)

In [None]:
# Smoke test of the positional encoding (d_model=8, max_len=5, dropout_prob=0.2).
# NOTE(review): relies on x_t being the float time-series tensor defined in a
# later cell -- confirm the execution order.
model = PositionalEncoding(8,5,0.2)
model.to(device)
y = model(x_t)
y.shape

torch.Size([41452, 5, 8])

In [None]:
class Encoder(nn.Module):
  """Transformer encoder for time-series regression (many-to-one).

  There is no embedding layer: positional encodings are added directly to
  the input features, the result passes through n_layers TransformerBlocks,
  and the first time step is layer-normalized and projected to one output.

  Args:
    max_len: number of positions precomputed by the positional encoding.
    d_k: per-head attention dimension.
    d_model: feature size of the input and of the hidden states.
    no_heads: attention heads per block.
    n_layers: number of stacked TransformerBlocks.
    dropout_prob: dropout probability used throughout.
  """

  def __init__(self,
               max_len,
               d_k,
               d_model,
               no_heads,
               n_layers,
               dropout_prob):
    super().__init__()

    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    self.transformer_blocks = nn.Sequential(*(
        TransformerBlock(d_k, d_model, no_heads, dropout_prob)
        for _ in range(n_layers)
    ))
    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, 1)

  def forward(self, x, mask=None):
    """Map x (N x T x D) to predictions of shape N x 1."""
    hidden = self.pos_encoding(x)

    # The Sequential is only used as a container: each block also needs
    # the mask, so the blocks are called one by one.
    for layer in self.transformer_blocks:
      hidden = layer(hidden, mask)

    # many-to-one: keep only the first time step, (N, T, D) -> (N, D)
    first_step = hidden[:, 0, :]

    # (N, D) -> (N, 1)
    return self.fc(self.ln(first_step))

In [None]:
# Positional arguments follow Encoder.__init__: max_len=5, d_k=2, d_model=8,
# no_heads=2, n_layers=3, dropout_prob=0.4
model = Encoder(5, 2, 8, 2, 3, 0.4)

In [None]:
# Select the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Move the model's parameters and buffers to the selected device
model.to(device)

Load Data 

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the training and validation targets from the .npy files under /content.
# NOTE(review): the " (1)" suffix in the file names looks like a duplicate-upload
# artifact -- confirm the files exist under exactly these names.
NT_y_train = np.load('/content/NT-Y-train (1).npy')
NT_y_val = np.load('/content/NT-Y-val (1).npy')

In [None]:
# Load the training and validation inputs (N x T x D arrays, per the variable
# names and the shape printed further down)
NTD_x_train = np.load('/content/NTD-X-train.npy')
NTD_x_val = np.load('/content/NTD-X-val.npy')

Test with time series data of shape NxTxD

In [None]:
# model.fc1.weight.dtype

In [None]:
print(NTD_x_train.shape)
# Convert to a float32 tensor (the dtype of the model's weights) and move the
# whole training set to the selected device
x_t = torch.tensor(NTD_x_train).type('torch.FloatTensor').to(device)
print(x_t.shape, type(x_t))

(41452, 5, 8)
torch.Size([41452, 5, 8]) <class 'torch.Tensor'>


In [None]:
# Expect an output of NxK = 41452,1
# Pushes the entire training set through the model in a single forward pass.
# NOTE(review): model.eval() has not been called at this point, so dropout is
# presumably active here; only the output shape is being checked.
y = model(x_t)
y.shape

torch.Size([41452, 1])

Now Let's Train and Use Torch dataloader to batch data

In [None]:
NT_y_train.reshape(-1,1).shape

(41452, 1)

In [None]:
# Convert the NumPy arrays to float32 CPU tensors; targets are reshaped to
# column vectors (N, 1) so they line up with the model's (N, 1) output
x_train = torch.tensor(NTD_x_train, dtype=torch.float32)
x_val = torch.tensor(NTD_x_val, dtype=torch.float32)
y_train = torch.tensor(NT_y_train.reshape(-1, 1), dtype=torch.float32)
y_val = torch.tensor(NT_y_val.reshape(-1, 1), dtype=torch.float32)

In [None]:
from torch.utils.data import TensorDataset
# Pair each input window with its target so a DataLoader yields (x, y) tuples
train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)

In [None]:
from torch.utils.data import DataLoader
# Batches of 32 (x, y) pairs.
# NOTE(review): no shuffle=True here, unlike the sentiment-analysis train_loader,
# so training batches come in dataset order every epoch -- confirm this is intended.
train_loader = DataLoader(train_dataset,
    batch_size=32)

valid_loader = DataLoader(val_dataset,
    batch_size=32)

In [None]:
# check how it works
# Inspect the first batch only: each batch is an (inputs, targets) tuple, so
# here k holds the input tensor and v the target tensor
for batch in train_loader:
  k=batch[0]
  v=batch[1]
  print("k:", k.shape, "v.shape:", v.shape)
  break

k: torch.Size([32, 5, 8]) v.shape: torch.Size([32, 1])


In [None]:
batch[0].shape[0]

32

In [None]:
# Mean squared error for the regression target
criterion = nn.MSELoss()
# Adam over the current model's parameters, with the optimizer's default hyperparameters
optimizer = torch.optim.Adam(model.parameters())

In [None]:
from datetime import datetime
# Training loop
def train(model, criterion, optimizer, train_loader, valid_loader, epochs):
  """Train `model` for `epochs` epochs and evaluate it after each one.

  Batches are (inputs, targets) tuples. They are moved to the device the
  model's parameters live on, so the loop runs on CPU as well as on GPU
  instead of requiring CUDA.

  Args:
    model: module called as model(inputs).
    criterion: loss called as criterion(outputs, targets).
    optimizer: optimizer over the model's parameters.
    train_loader: iterable of training batches.
    valid_loader: iterable of validation batches.
    epochs: number of passes over train_loader.

  Returns:
    NumPy array of length `epochs` with the per-example average training
    loss of every epoch. Validation losses are printed but not returned.
  """
  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  # Run on whatever device the model is already on (CPU if it has no parameters)
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")

  for it in range(epochs):
    model.train()   # Model in training mode
    t0 = datetime.now()
    train_loss = 0
    n_train = 0
    for batch in train_loader:
      # move data to the model's device
      x_c = batch[0].to(run_device)
      y_c = batch[1].to(run_device)

      # zero the parameter gradients
      optimizer.zero_grad()

      # Forward pass
      outputs = model(x_c)
      loss = criterion(outputs, y_c)

      # Backward and optimize
      loss.backward()   # Compute Gradients (Back prop)
      optimizer.step()  # Update weights(GD/Adam)

      # Weight the batch-mean loss by batch size so the epoch average is
      # exact even when the last batch is smaller
      batch_size = batch[0].shape[0]
      train_loss += loss.item() * batch_size
      n_train += batch_size

    # Get average train loss
    train_loss = train_loss / n_train

    # Evaluate model at the end of each epoch
    model.eval()
    test_loss = 0
    n_test = 0
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time
    with torch.no_grad():
      for batch in valid_loader:
        # move data to the model's device
        x_c = batch[0].to(run_device)
        y_c = batch[1].to(run_device)

        # Forward pass
        outputs = model(x_c)
        loss = criterion(outputs, y_c)
        batch_size = batch[0].shape[0]
        test_loss += loss.item() * batch_size
        n_test += batch_size
    test_loss = test_loss / n_test

    # Save losses
    train_losses[it] = train_loss
    test_losses[it] = test_loss

    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
      Test Loss: {test_loss:.4f}, Duration: {dt}')

  return train_losses

Now Train

In [None]:
train(model, criterion, optimizer, train_loader, valid_loader, epochs=40)

Epoch 1/40, Train Loss: 0.1082,       Test Loss: 0.0668, Duration: 0:00:11.225497
Epoch 2/40, Train Loss: 0.0479,       Test Loss: 0.0377, Duration: 0:00:11.240074
Epoch 3/40, Train Loss: 0.0384,       Test Loss: 0.0338, Duration: 0:00:11.149384
Epoch 4/40, Train Loss: 0.0364,       Test Loss: 0.0317, Duration: 0:00:11.160236
Epoch 5/40, Train Loss: 0.0350,       Test Loss: 0.0306, Duration: 0:00:10.831368
Epoch 6/40, Train Loss: 0.0340,       Test Loss: 0.0318, Duration: 0:00:10.779368
Epoch 7/40, Train Loss: 0.0330,       Test Loss: 0.0314, Duration: 0:00:11.995799
Epoch 8/40, Train Loss: 0.0330,       Test Loss: 0.0287, Duration: 0:00:11.397091
Epoch 9/40, Train Loss: 0.0326,       Test Loss: 0.0298, Duration: 0:00:11.478709
Epoch 10/40, Train Loss: 0.0322,       Test Loss: 0.0290, Duration: 0:00:11.353067
Epoch 11/40, Train Loss: 0.0317,       Test Loss: 0.0284, Duration: 0:00:11.243980
Epoch 12/40, Train Loss: 0.0313,       Test Loss: 0.0289, Duration: 0:00:11.471582
Epoch 13/40, 

array([0.10824946, 0.0478946 , 0.03839886, 0.03638623, 0.03498856,
       0.03404441, 0.0329901 , 0.0329567 , 0.03256513, 0.03215574,
       0.03169486, 0.03126598, 0.03113167, 0.03100631, 0.03085495,
       0.03053572, 0.03048201, 0.03015668, 0.03017455, 0.03015544,
       0.03004804, 0.03014352, 0.0297592 , 0.02970045, 0.02919757,
       0.0295211 , 0.02955804, 0.02978716, 0.02951657, 0.02949136,
       0.0298884 , 0.02975253, 0.02926124, 0.02934393, 0.02935099,
       0.0293865 , 0.02889427, 0.02924933, 0.02887025, 0.02904667])