In [1]:
import pickle

import torch
import torch.nn as nn
import torch.optim as optim

import math
import numpy as np
import pandas as pd

In [2]:
objects = []
# Encoded sequences:  0 -> 101
with (open("encoded_sequences.pkl", "rb")) as openfile:
    while True:
        try:
            objects.append(pickle.load(openfile))
        except EOFError:
            break

encoded_sequences = objects[0]
print(f"Length of data: {len(encoded_sequences)}")
print(f"Ex: {encoded_sequences[0]}")


Length of data: 6873
Ex: [18, 18, 18, 18, 1, 6]


In [3]:
objects

[[[18, 18, 18, 18, 1, 6],
  [97, 4],
  [59, 1, 6],
  [4, 97],
  [54, 1, 6],
  [54, 1, 6],
  [49, 49, 49, 1, 6],
  [26, 1, 6],
  [6, 26, 1],
  [26, 1, 6],
  [64, 1, 6],
  [6, 64, 1],
  [6, 66, 66, 1],
  [6, 43, 1, 43, 43],
  [54, 54, 54, 1, 6],
  [18, 18, 18, 18, 1, 6],
  [48, 6],
  [6, 18, 1],
  [8, 4],
  [6, 64, 1],
  [6, 64, 1],
  [51, 1, 6],
  [6, 26, 1],
  [6, 26, 1],
  [26, 1, 6],
  [6, 26, 1],
  [26, 1, 6],
  [6, 26, 1],
  [64, 1, 6],
  [6, 65, 1],
  [54, 1, 6],
  [88, 4],
  [6, 60, 1],
  [64, 1, 6],
  [6, 27, 1],
  [65, 1, 6],
  [6, 65, 1],
  [65, 1, 6],
  [6, 65, 1],
  [65, 1, 6],
  [6, 65, 1],
  [65, 1, 6],
  [6, 65, 1],
  [49, 6],
  [6, 64, 1],
  [31, 1, 6],
  [6, 31, 1],
  [30, 1, 6],
  [23, 6],
  [64, 1, 6],
  [52, 6],
  [6, 53, 1],
  [6, 65, 65, 1],
  [6, 53, 1],
  [64, 1, 6],
  [66, 1, 6],
  [6, 57, 1],
  [6, 57, 1],
  [6, 64, 1],
  [6, 64, 1],
  [66, 1, 6],
  [6, 64, 1],
  [6, 64, 1],
  [64, 1, 6],
  [64, 1, 6],
  [6, 54, 1],
  [64, 1, 6],
  [6, 30, 1],
  [55, 1, 6],
  [

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout_p, max_len):
        super().__init__()
        # Modified version from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
        # max_len determines how far the position can have an effect on a token (window)
        
        # Info
        self.dropout = nn.Dropout(dropout_p)
        
        # Encoding - From formula
        pos_encoding = torch.zeros(max_len, dim_model)
        positions_list = torch.arange(0, max_len, dtype=torch.float).view(-1, 1) # 0, 1, 2, 3, 4, 5
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model) # 1000^(2i/dim_model)
        
        # PE(pos, 2i) = sin(pos/1000^(2i/dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)
        
        # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model))
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)
        
        # Saving buffer (same as parameter without gradients needed)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pos_encoding",pos_encoding)
        
    def forward(self, token_embedding: torch.tensor) -> torch.tensor:
        # Residual connection + pos encoding
        return self.dropout(token_embedding + self.pos_encoding[:token_embedding.size(0), :])

In [5]:
class Transformer(nn.Module):
    """
    Model from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    # Constructor
    def __init__(
        self,
        num_tokens,
        dim_model,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        dropout_p,
    ):
        super().__init__()

        # INFO
        self.model_type = "Transformer"
        self.dim_model = dim_model

        # LAYERS
        self.positional_encoder = PositionalEncoding(
            dim_model=dim_model, dropout_p=dropout_p, max_len=5000
        )
        self.embedding = nn.Embedding(num_tokens, dim_model)
        self.transformer = nn.Transformer(
            d_model=dim_model,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dropout=dropout_p,
        )
        self.out = nn.Linear(dim_model, num_tokens)
        
    def forward(self, src, tgt, tgt_mask=None, src_pad_mask=None, tgt_pad_mask=None):
        # Src size must be (batch_size, src sequence length)
        # Tgt size must be (batch_size, tgt sequence length)

        # Embedding + positional encoding - Out size = (batch_size, sequence length, dim_model)
        src = self.embedding(src) * math.sqrt(self.dim_model)
        tgt = self.embedding(tgt) * math.sqrt(self.dim_model)
        src = self.positional_encoder(src)
        tgt = self.positional_encoder(tgt)
        
        # We could use the parameter batch_first=True, but our KDL version doesn't support it yet, so we permute
        # to obtain size (sequence length, batch_size, dim_model),
        src = src.permute(1,0,2)
        tgt = tgt.permute(1,0,2)

        # Transformer blocks - Out size = (sequence length, batch_size, num_tokens)
        transformer_out = self.transformer(src, tgt, tgt_mask=tgt_mask, src_key_padding_mask=src_pad_mask, tgt_key_padding_mask=tgt_pad_mask)
        out = self.out(transformer_out)
        
        return out
      
    def get_tgt_mask(self, size) -> torch.tensor:
        # Generates a squeare matrix where the each row allows one word more to be seen
        mask = torch.tril(torch.ones(size, size) == 1) # Lower triangular matrix
        mask = mask.float()
        mask = mask.masked_fill(mask == 0, float('-inf')) # Convert zeros to -inf
        mask = mask.masked_fill(mask == 1, float(0.0)) # Convert ones to 0
        
        # EX for size=5:
        # [[0., -inf, -inf, -inf, -inf],
        #  [0.,   0., -inf, -inf, -inf],
        #  [0.,   0.,   0., -inf, -inf],
        #  [0.,   0.,   0.,   0., -inf],
        #  [0.,   0.,   0.,   0.,   0.]]
        
        return mask
    
    def create_pad_mask(self, matrix: torch.tensor, pad_token: int) -> torch.tensor:
        # If matrix = [1,2,3,0,0,0] where pad_token=0, the result mask is
        # [False, False, False, True, True, True]
        return (matrix == pad_token)

In [6]:
def make_data(encoded_sequences):

    # Vi har encode:at sekvenserna till 0->101
    SOS_token = np.array([200])
    EOS_token = np.array([300])
    length = 16

    data = []

    for seq in encoded_sequences:
        for i in range(1, len(seq)):
            #data.append((seq[:i], seq[i]))
            #X_seq, y_seq = seq[:i], seq[i:]
            X_seq = seq[:i+1]
            X = np.concatenate((SOS_token, X_seq, EOS_token))
            y_seq = X[i+1:]
            y = np.concatenate((SOS_token, y_seq, EOS_token))

            data.append([X, y])

    #np.random.shuffle(data)

    return data


def batchify_data(data, batch_size=16, padding=True, padding_token=400):
    batches = []
    for idx in range(0, len(data), batch_size):
        # We make sure we dont get the last bit if its not batch_size size
        if idx + batch_size < len(data):
            # Here you would need to get the max length of the batch,
            # and normalize the length with the PAD token.
            if padding:
                max_batch_length = 0

                # Get longest sentence in batch
                for seq in data[idx : idx + batch_size]:
                    if len(seq[1]) > max_batch_length:
                        max_batch_length = len(seq[1])
                #print(max_batch_length)
                # Append X padding tokens until it reaches the max length
                for seq_idx in range(batch_size):
                    remaining_length_0 = max_batch_length - len(data[idx + seq_idx][0])
                    remaining_length_1 = max_batch_length - len(data[idx + seq_idx][1])
                    #data[idx + seq_idx] += [padding_token] * remaining_length
                    data[idx + seq_idx][0] = np.concatenate((data[idx + seq_idx][0], [padding_token] * remaining_length_0))
                    data[idx + seq_idx][1] = np.concatenate((data[idx + seq_idx][1], [padding_token] * remaining_length_1))

            batches.append(np.array(data[idx : idx + batch_size]))#.astype(np.int64))

    print(f"{len(batches)} batches of size {batch_size}")

    return batches


train_data = make_data(encoded_sequences)
#print(train_data[0])
#train_dataloader = batchify_data(train_data)
#print(train_data[0])
#val_dataloader = batchify_data(val_data)

In [7]:
# Hittar maxlängd på train_data samt index
max_len = 0
for a in train_data:
    
    new_max_len = max(max_len, len(a[0]))
    if new_max_len != max_len:
        print(f"New max len: {len(a[0])}")
        print(f"{a}\n")
        max_len = new_max_len

print(new_max_len)


New max len: 4
[array([200,  18,  18, 300]), array([200,  18, 300, 300])]

New max len: 5
[array([200,  18,  18,  18, 300]), array([200,  18, 300, 300])]

New max len: 6
[array([200,  18,  18,  18,  18, 300]), array([200,  18, 300, 300])]

New max len: 7
[array([200,  18,  18,  18,  18,   1, 300]), array([200,   1, 300, 300])]

New max len: 8
[array([200,  18,  18,  18,  18,   1,   6, 300]), array([200,   6, 300, 300])]

New max len: 9
[array([200,  64,  73,  76,  64,  73,  76,  73, 300]), array([200,  73, 300, 300])]

New max len: 10
[array([200,  64,  73,  76,  64,  73,  76,  73,  76, 300]), array([200,  76, 300, 300])]

New max len: 11
[array([200,  64,  73,  76,  64,  73,  76,  73,  76,  64, 300]), array([200,  64, 300, 300])]

New max len: 12
[array([200,  64,  73,  76,  64,  73,  76,  73,  76,  64,  73, 300]), array([200,  73, 300, 300])]

New max len: 13
[array([200,  64,  73,  76,  64,  73,  76,  73,  76,  64,  73,  76, 300]), array([200,  76, 300, 300])]

New max len: 14
[arra

In [8]:
# Hittar maxlängd på encoded_sequence samt index
max_len = 0
for idx, seq in enumerate(encoded_sequences):
    new_max_len = max(max_len, len(seq))
    if new_max_len != max_len:
        print(f"Val: {new_max_len}, idx: {idx}")
        max_len = new_max_len
    
print(max_len)

Val: 6, idx: 0
Val: 13, idx: 916
Val: 16, idx: 2723
16


In [137]:
len(encoded_sequences[2723])

16

In [127]:
max_len, idx1, idx2 = 0, 0, 0

for i1, a in enumerate(train_dataloader):
    for i2, b in enumerate(a):
        new_max_len = max(max_len, len(b[1]))
        if new_max_len != max_len:
            idx1, idx2 = i1, i2
            max_len = new_max_len
        #print(len(b))
        #break
    #break

print(max_len)
print(f"idx1: {idx1}, idx2: {idx2}")

17
idx1: 341, idx2: 0


In [133]:
train_dataloader[341]

array([[[200.,   6., 300., 400., 400., 400., 400., 400., 400., 400.,
         400., 400., 400., 400., 400., 400., 400.],
        [200.,  66.,   1.,   9.,   3., 300., 400., 400., 400., 400.,
         400., 400., 400., 400., 400., 400., 400.]],

       [[200.,   6.,  66., 300., 400., 400., 400., 400., 400., 400.,
         400., 400., 400., 400., 400., 400., 400.],
        [200.,   1.,   9.,   3., 300., 400., 400., 400., 400., 400.,
         400., 400., 400., 400., 400., 400., 400.]],

       [[200.,   6.,  66.,   1., 300., 400., 400., 400., 400., 400.,
         400., 400., 400., 400., 400., 400., 400.],
        [200.,   9.,   3., 300., 400., 400., 400., 400., 400., 400.,
         400., 400., 400., 400., 400., 400., 400.]],

       [[200.,   6.,  66.,   1.,   9., 300., 400., 400., 400., 400.,
         400., 400., 400., 400., 400., 400., 400.],
        [200.,   3., 300., 400., 400., 400., 400., 400., 400., 400.,
         400., 400., 400., 400., 400., 400., 400.]],

       [[200.,  64., 300

**train_dataloader - batch x batch_size x 2 (X, y) x length**

In [107]:
print(f"Num batches: {len(train_dataloader)}")
print(f"Shape of each batch: {train_dataloader[0].shape}")
print("\nExample from sample of batch:")
train_dataloader[0][0]

Num batches: 859
Shape of each batch: (16, 2, 7)

Example from sample of batch:


array([[200.,  18., 300., 400., 400., 400., 400.],
       [200.,  18.,  18.,  18.,   1.,   6., 300.]])

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Transformer(
    num_tokens=4, dim_model=8, num_heads=2, num_encoder_layers=3, num_decoder_layers=3, dropout_p=0.1
).to(device)
opt = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

In [9]:
def train_loop(model, opt, loss_fn, dataloader):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        X, y = batch[:, 0], batch[:, 1]
        X, y = torch.tensor(X).to(device), torch.tensor(y).to(device)

        # Now we shift the tgt by one so with the <SOS> we predict the token at pos 1
        y_input = y[:,:-1]
        y_expected = y[:,1:]
        
        # Get mask to mask out the next words
        sequence_length = y_input.size(1)
        tgt_mask = model.get_tgt_mask(sequence_length).to(device)

        # Standard training except we pass in y_input and tgt_mask
        pred = model(X, y_input, tgt_mask)

        # Permute pred to have batch size first again
        pred = pred.permute(1, 2, 0)      
        loss = loss_fn(pred, y_expected)

        opt.zero_grad()
        loss.backward()
        opt.step()
    
        total_loss += loss.detach().item()
        
    return total_loss / len(dataloader)

In [10]:
def fit(model, opt, loss_fn, train_dataloader, val_dataloader, epochs):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    
    # Used for plotting later on
    train_loss_list = []
    #validation_loss_list = []
    
    print("Training and validating model")
    for epoch in range(epochs):
        print("-"*25, f"Epoch {epoch + 1}","-"*25)
        
        train_loss = train_loop(model, opt, loss_fn, train_dataloader)
        train_loss_list += [train_loss]
        
        #validation_loss = validation_loop(model, loss_fn, val_dataloader)
        #validation_loss_list += [validation_loss]
        
        print(f"Training loss: {train_loss:.4f}")
        #print(f"Validation loss: {validation_loss:.4f}")
        print()
        
    return train_loss_list, validation_loss_list
    
train_loss_list, validation_loss_list = fit(model, opt, loss_fn, train_dataloader, None, 10)


Training and validating model
------------------------- Epoch 1 -------------------------


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [40]:
encoded_sequences

[[18, 18, 18, 18, 1, 6],
 [97, 4],
 [59, 1, 6],
 [4, 97],
 [54, 1, 6],
 [54, 1, 6],
 [49, 49, 49, 1, 6],
 [26, 1, 6],
 [6, 26, 1],
 [26, 1, 6],
 [64, 1, 6],
 [6, 64, 1],
 [6, 66, 66, 1],
 [6, 43, 1, 43, 43],
 [54, 54, 54, 1, 6],
 [18, 18, 18, 18, 1, 6],
 [48, 6],
 [6, 18, 1],
 [8, 4],
 [6, 64, 1],
 [6, 64, 1],
 [51, 1, 6],
 [6, 26, 1],
 [6, 26, 1],
 [26, 1, 6],
 [6, 26, 1],
 [26, 1, 6],
 [6, 26, 1],
 [64, 1, 6],
 [6, 65, 1],
 [54, 1, 6],
 [88, 4],
 [6, 60, 1],
 [64, 1, 6],
 [6, 27, 1],
 [65, 1, 6],
 [6, 65, 1],
 [65, 1, 6],
 [6, 65, 1],
 [65, 1, 6],
 [6, 65, 1],
 [65, 1, 6],
 [6, 65, 1],
 [49, 6],
 [6, 64, 1],
 [31, 1, 6],
 [6, 31, 1],
 [30, 1, 6],
 [23, 6],
 [64, 1, 6],
 [52, 6],
 [6, 53, 1],
 [6, 65, 65, 1],
 [6, 53, 1],
 [64, 1, 6],
 [66, 1, 6],
 [6, 57, 1],
 [6, 57, 1],
 [6, 64, 1],
 [6, 64, 1],
 [66, 1, 6],
 [6, 64, 1],
 [6, 64, 1],
 [64, 1, 6],
 [64, 1, 6],
 [6, 54, 1],
 [64, 1, 6],
 [6, 30, 1],
 [55, 1, 6],
 [64, 1, 6],
 [55, 1, 6],
 [54, 1, 6],
 [66, 1, 6],
 [6, 42, 1],
 [6, 42