In [1]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, random_split, DataLoader

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
# Training hyperparameters, grouped in one cell so they can be tuned in one place.
# NOTE(review): `NUM_EPOCHES` is spelled as the rest of the notebook presumably
# references it; rename to NUM_EPOCHS only together with every use.
LR = 1e-5
NUM_EPOCHES = 2
BATCH_SIZE = 64

In [4]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    The table has shape ``(max_seq_len, 1, embedding_size)``; even feature
    columns hold ``sin(pos * w_k)`` and odd columns hold ``cos(pos * w_k)`` with
    ``w_k = exp(-2k * ln(10000) / embedding_size)``.

    The table is registered as a non-persistent buffer, so it follows the module
    through ``.to()``, ``.cuda()``, ``.double()`` etc. but is not written to
    ``state_dict`` (checkpoints keep the same keys as before).

    Args:
        max_seq_len: Number of positions to precompute.
        embedding_size: Size of the feature dimension (odd sizes are supported).
        dropout: Dropout probability applied after adding the encoding.
        device: Device the table is initially placed on.
    """

    def __init__(self, max_seq_len=2000, embedding_size=300, dropout=0.2, device="cpu"):
        super(PositionalEncoding, self).__init__()

        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embedding_size, 2).float() * (-math.log(10000.0) / embedding_size)
        )
        angles = position * div_term  # (max_seq_len, ceil(embedding_size / 2))

        pe_matrix = torch.zeros(max_seq_len, embedding_size)
        pe_matrix[:, 0::2] = torch.sin(angles)
        # For an odd embedding_size there is one fewer odd column than even
        # columns, so drop the surplus frequency instead of failing on shape.
        pe_matrix[:, 1::2] = torch.cos(angles[:, : embedding_size // 2])

        # Insert a singleton axis at dim 1 so the table broadcasts over it in forward().
        self.register_buffer("pe_matrix", pe_matrix.unsqueeze(1).to(device), persistent=False)

    def forward(self, x):
        """Return ``dropout(x + pe_matrix[:x.size(0)])``.

        Positions are indexed along dim 0 of ``x``; the table's singleton dim 1
        broadcasts against dim 1 of ``x``. If ``x.size(0)`` exceeds
        ``max_seq_len`` the addition fails with a broadcasting ``RuntimeError``.
        """
        x = x + self.pe_matrix[:x.size(0), :]
        x = self.dropout(x)
        return x

In [5]:
class Transformer(nn.Module):
    """Encoder-decoder model built from ``nn.TransformerEncoder`` / ``nn.TransformerDecoder``.

    Source and target token indices are embedded, scaled by ``sqrt(embed_size)``,
    given sinusoidal positional encodings, run through the encoder and decoder
    stacks, and projected to log-probabilities over the target vocabulary.

    Args:
        s_vocab_size: Number of rows in the source embedding table.
        t_vocab_size: Number of rows in the target embedding table and size of
            the output distribution.
        embed_size: Embedding / model dimension.
        num_head: Number of attention heads.
        num_ff: Hidden size of the feed-forward sub-layers.
        encode_layers: Number of encoder layers.
        decode_layers: Number of decoder layers.
        dropout: Dropout probability used by the transformer layers and the
            positional encodings.
        device: Device on which the positional-encoding tables are created.
        max_seq_len: Longest sequence the positional encodings support. This is
            independent of the vocabulary sizes; raise it for longer inputs.
        verbose: When true, print the tensor shape after every stage of
            ``forward`` (shape debugging aid).

    NOTE(review): no attention or padding masks are passed to the encoder or
    decoder; confirm that is intended for this task.
    """

    def __init__(self, s_vocab_size, t_vocab_size, embed_size, num_head, num_ff, encode_layers, decode_layers, dropout=0.2, device="cpu", max_seq_len=5000, verbose=False):
        super(Transformer, self).__init__()

        self.s_vocab_size = s_vocab_size
        self.t_vocab_size = t_vocab_size
        self.embed_size = embed_size
        self.num_head = num_head
        self.num_ff = num_ff
        self.encoder_num_layers = encode_layers
        self.decoder_num_layers = decode_layers
        self.dropout = dropout
        self.device = device
        self.max_seq_len = max_seq_len
        self.verbose = verbose

        self.encoder_embed = nn.Embedding(self.s_vocab_size, embed_size)
        self.decoder_embed = nn.Embedding(self.t_vocab_size, embed_size)
        # The positional tables are sized by sequence length, not vocabulary size.
        self.encoder_positional_encoding = PositionalEncoding(self.max_seq_len, self.embed_size, dropout=self.dropout, device=device)
        self.decoder_positional_encoding = PositionalEncoding(self.max_seq_len, self.embed_size, dropout=self.dropout, device=device)

        # The template layers stay registered as attributes so the module tree
        # (and therefore state_dict keys) is unchanged for existing checkpoints.
        self.encoder_layer = nn.TransformerEncoderLayer(self.embed_size, self.num_head, self.num_ff, dropout=self.dropout)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, self.encoder_num_layers)

        self.decoder_layer = nn.TransformerDecoderLayer(self.embed_size, self.num_head, self.num_ff, dropout=self.dropout)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, self.decoder_num_layers)

        self.final = nn.Linear(self.embed_size, self.t_vocab_size)
        # Normalise over the vocabulary axis; with no `dim`, PyTorch picks dim 0
        # for 3-D input, which is not the vocabulary axis.
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def _trace(self, label, tensor):
        """Print ``label`` and the size of ``tensor`` when ``verbose`` is enabled."""
        if self.verbose:
            print(label, tensor.size())

    def forward(self, x, y):
        """Return log-probabilities over the target vocabulary.

        Args:
            x: Source token indices. Positions are taken along dim 0 by the
                positional encoding, so sequence-first layout is assumed.
            y: Target token indices, same layout as ``x``.

        Returns:
            Tensor whose last dimension has size ``t_vocab_size`` and holds
            log-softmax values.
        """
        scale = math.sqrt(self.embed_size)

        x = self.encoder_embed(x) * scale
        self._trace("x - Embedding", x)
        y = self.decoder_embed(y) * scale
        self._trace("y - Embedding", y)

        x = self.encoder_positional_encoding(x)
        self._trace("x - Encoder Positional Encoding", x)
        y = self.decoder_positional_encoding(y)
        self._trace("y - Decoder Positional Encoding", y)

        memory = self.encoder(x)
        self._trace("memory size: ", memory)
        out = self.decoder(y, memory)
        self._trace("out decoder size: ", out)
        x = self.final(out)
        self._trace("final output size", x)
        x = self.log_softmax(x)
        return x

In [6]:
# Vocabulary sizes: hard-coded placeholders; the trailing comments show the
# expressions they stand in for.
source_vocab_size = 20000  # len(TEXT.vocab.stoi)
target_vocab_size = 5  # len(LABEL.vocab.stoi)

# Model width, attention heads and feed-forward width passed to Transformer.
# nn.MultiheadAttention requires embed_size to be divisible by num_head.
num_head = 3
embed_size = 300
num_ff = 300

# Depth of the encoder and decoder stacks.
encoder_layers = decoder_layers = 1

In [7]:
# Build the model from the hyperparameters above and move it to the selected
# device. `Module.to` returns the module itself, so the trailing bare `model`
# displays the same repr as before.
model = Transformer(
    s_vocab_size=source_vocab_size,
    t_vocab_size=target_vocab_size,
    embed_size=embed_size,
    num_head=num_head,
    num_ff=num_ff,
    encode_layers=encoder_layers,
    decode_layers=decoder_layers,
    device=device,
).to(device)
model

Transformer(
  (encoder_embed): Embedding(20000, 300)
  (decoder_embed): Embedding(5, 300)
  (encoder_positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (decoder_positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
    )
    (linear1): Linear(in_features=300, out_features=300, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (linear2): Linear(in_features=300, out_features=300, bias=True)
    (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.2, inplace=False)
    (dropout2): Dropout(p=0.2, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAt

In [8]:
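# Disabled smoke test: push one batch through the model and inspect the output.
# NOTE(review): `train_iterator` is not defined in the visible cells, and
# reshape(BATCH_SIZE, -1) yields (batch, seq) while PositionalEncoding indexes
# positions along dim 0 (sequence-first) — confirm the layout before re-enabling.
#for batch_idxx, (x, y) in enumerate(train_iterator):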
#    x = x.reshape(BATCH_SIZE, -1)
#    y = y.reshape(BATCH_SIZE, -1)
#    print(x)
#    print(y)
#    print(x.size())
#    print(y.size())
#    out = model(x.to(device), y.to(device))
#    print(out)
#    print(out.size())
#    break