In [None]:
# Encoder/decoder transformer, like in the paper

# Single Encoder block

# Embed input into a length 512 vector
# Add positional encoding to the input
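# Alternating sine and cosine functions across the embedding dimension (d_model = 512)
# pos = position of the token in the sequence; i indexes the sin/cos pairs, so 2i and 2i+1 are the embedding dimensions
# PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
# PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))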
# Sum positional encoding and input
# apply dropout with chance .1

# Self-attention
# Each token in the input sequence is a query
# Every token in the sequence (including the query token itself) is also a key
# Project query and key into a new dimensional space with a linear transform (this is multi-head attention)
# We do the following 16 times for multi-head attention
# Dot the projected query with the projected key matrix
# QK^T
# Attention(Q, K, V) = softmax(Q K^T / √d_k) V
# √d_k is the square root of the key dimension d_k (the paper uses d_k = d_v = d_model / h, so it matches the value dimension here)
# Keys and values are the same in this particular case
# Basically, compute an attention score (scalar) between each query and each key, then scale the value by that number
# So less relevant other tokens are minimized
# Concat the results of the attention equation
# Run through another linear layer to reproject
# Each attention head outputs 1/16th of the input embedding len

# After doing multi-head attention (16)
# Apply dropout with chance .1
# Add the input to the layer (original embedded sequences) and the output of attention
# Run layer normalization (unclear which layer norm to use)

# Run a feed forward network
# Add the layer's input to the output of the feed-forward network (residual connection)
# Normalize again

# Single decoder block

# Shift outputs right, to start with start token
# Do positional encoding
# Do dropout with chance .1
# Mask outputs, so queries can only see keys that came before the query
# Run multi-head attention
# Add and norm

# Run multi-head attention again, but this time v,k is from encoder stack, and q is from decoder stack
# Apply dropout with chance .1
# When doing add and norm, add in the decoder stack input

# Feed forward

# At top of stack, do another linear layer and softmax

# Might want to move layer norm inside the residual block - https://arxiv.org/pdf/2002.04745.pdf
# Layer normalization - https://arxiv.org/pdf/1607.06450.pdf

In [1]:
import numpy as np
import torch
from torch import nn
import functorch
import sys
import os
import math
# Extend the import search path with the absolute form of ../../data so that
# modules stored there can be imported by later cells (the next cell imports
# text_data — presumably from this directory).
sys.path.append(os.path.abspath("../../data"))

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

BATCH_SIZE = 1        # passed to generate_datasets()
SP_VOCAB_SIZE = 1000  # passed to the dataset wrapper's constructor
TRAIN_SIZE = 500      # first entry of Wrapper.split_lengths

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from text_data import CNNDatasetWrapper

class Wrapper(CNNDatasetWrapper):
    """Small dataset configuration for quick experiments.

    Overrides three class-level settings of ``CNNDatasetWrapper``. How the base
    class consumes them is not visible in this notebook.
    """

    # Three split sizes: TRAIN_SIZE, a tenth of TRAIN_SIZE (rounded down), and 100.
    # NOTE(review): presumably train / validation / test, in that order — confirm
    # against CNNDatasetWrapper.
    split_lengths = [TRAIN_SIZE, math.floor(0.1 * TRAIN_SIZE), 100]

    # Input and target lengths — presumably measured in tokens; verify against
    # the base class.
    x_length = target_length = 15

# Instantiate the wrapper with the configured vocabulary size and device, then
# ask it for the datasets using the configured batch size.
wrapper = Wrapper(SP_VOCAB_SIZE, DEVICE)
datasets = wrapper.generate_datasets(BATCH_SIZE)

# Pull out the training and validation splits by key.
train, valid = datasets["train"], datasets["validation"]

Found cached dataset cnn_dailymail (/Users/vik/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
100%|██████████| 3/3 [00:00<00:00, 45.27it/s]
sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=tokens.txt --model_prefix=cnn_dailymail --vocab_size=1000 --model_type=unigram
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: tokens.txt
  input_format: 
  model_prefix: cnn_dailymail
  model_type: UNIGRAM
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piec

In [7]:
# Take the first item of `train.dataset`; it unpacks into three values.
# NOTE(review): presumably x = source token ids, y = target token ids and
# prev_y = the right-shifted target fed to the decoder — confirm against
# CNNDatasetWrapper.
x, y, prev_y = train.dataset[0]

In [9]:
# Embedding width; the design notes at the top call for a length-512 vector per token.
D_MODEL = 512

# Lookup table with one D_MODEL-dimensional vector per vocabulary entry.
# NOTE(review): the module is created on the default device; if the wrapper
# places its tensors on DEVICE this needs `.to(DEVICE)` — confirm.
embed = nn.Embedding(wrapper.vocab_size, D_MODEL)

In [13]:
# Embed the example and keep the resulting tensor: the original stored only
# the `.shape` under the name `x_embed`, which left no embedding to work with
# in later steps (e.g. adding the positional encoding).
x_embed = embed(x)

# Show the shape as the cell output.
x_embed.shape

torch.Size([15, 512])

In [None]:
# PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
# PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))