In [1]:
import spacy
import time
import random
import math
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 1234


def _seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs with `seed`.

    Also switches on the cuDNN deterministic flag.
    """
    torch.backends.cudnn.deterministic = True
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seed_fn in seeders:
        seed_fn(seed)


_seed_everything(SEED)

In [3]:
# Load the small German and English spaCy pipelines (German first, then English).
german_tokenizer, english_tokenizer = (
    spacy.load(model_name)
    for model_name in ("de_core_news_sm", "en_core_web_sm")
)

In [4]:
def tokenize_german(text):
    """Run `german_tokenizer.tokenizer` on `text` and return the tokens' `.text` strings as a list."""
    doc = german_tokenizer.tokenizer(text)
    return [tok.text for tok in doc]
def tokenize_english(text):
    """Run `english_tokenizer.tokenizer` on `text` and return the tokens' `.text` strings as a list."""
    doc = english_tokenizer.tokenizer(text)
    return [tok.text for tok in doc]

In [5]:
# Source (German) and target (English) fields. Both lowercase their input, emit
# batch-first tensors, and wrap every sequence as <sos> ... <eos>.
# The end-of-sequence token must be distinct from the start token: with both set
# to '<sos>' the two markers share one vocabulary entry, so the end of a
# sequence cannot be told apart from its start.
SRC = Field(tokenize=tokenize_german,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)
TRG = Field(tokenize=tokenize_english,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

In [6]:
# Multi30k train/validation/test splits: '.de' files are processed by SRC,
# '.en' files by TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

In [7]:
# Build the source and then the target vocabulary from the training split only,
# passing a minimum token frequency of 2.
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq=2)

In [8]:
BATCH_SIZE = 128

# Use the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [9]:
# One BucketIterator per split, all with the same batch size and target device.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    datasets=(train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [12]:
class Encoder(nn.Module):
    """Transformer encoder: scaled token embeddings plus learned positional
    embeddings, followed by a stack of `EncoderLayer` blocks.

    Args:
        input_dim: size of the source vocabulary (rows of the token embedding).
        hid_dim: embedding / hidden size.
        n_layers: number of `EncoderLayer` blocks to stack.
        n_heads: forwarded to every `EncoderLayer`.
        pf_dim: forwarded to every `EncoderLayer`.
        dropout: dropout probability applied to the summed embeddings and
            forwarded to every `EncoderLayer`.
        device: device on which the scale constant and position indices live.
        max_len: number of rows in the positional embedding table; sequences
            longer than this cannot be embedded.
    """

    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim,dropout, device,max_len=100):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_len, hid_dim)
        # Must be an nn.Dropout module named `dropout`: forward() calls it.
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        self.layers = nn.ModuleList([EncoderLayer(hid_dim,
                                                  n_heads,
                                                  pf_dim,
                                                  dropout,
                                                  device)
                                     for _ in range(n_layers)])

    def forward(self, src, src_mask):
        """Encode a batch of token-id sequences.

        Args:
            src: token ids, [batch_size, src_len].
            src_mask: mask handed unchanged to every layer,
                [batch_size, 1, 1, src_len].

        Returns:
            Tensor of shape [batch_size, src_len, hid_dim].
        """
        # Read the sizes from the tensor's shape (indexing `src` itself would
        # return rows of token ids, not dimensions).
        batch_size = src.shape[0]
        src_len = src.shape[1]

        # One row of positions 0..src_len-1 per batch element, created directly
        # on the target device.
        pos = torch.arange(0, src_len, device=self.device).unsqueeze(0).repeat(batch_size, 1)
        #pos = [batch_size, src_len]

        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        #src = [batch_size, src_len, hid_dim]

        for layer in self.layers:
            src = layer(src, src_mask)
        return src

In [14]:
class MultiHeadAttentionLayer(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        hid_dim: model hidden size; must be divisible by `n_heads`.
        n_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights before
            they are multiplied with the values.
        device: device on which the scaling constant is stored.
    """

    def __init__(self, hid_dim, n_heads,dropout, device):
        super().__init__()
        assert hid_dim % n_heads == 0
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        # Attribute name must be `dropout`: forward() calls self.dropout(...).
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, query, key,value, mask = None):
        """Attend from `query` positions over `key`/`value` positions.

        Args:
            query: [batch_size, query_len, hid_dim]
            key:   [batch_size, key_len, hid_dim]
            value: [batch_size, value_len, hid_dim]
            mask: optional tensor broadcastable to
                [batch_size, n_heads, query_len, key_len]; positions where
                `mask == 0` are excluded from attention.

        Returns:
            (x, attention): x is [batch_size, query_len, hid_dim]; attention is
            the softmax weights before dropout,
            [batch_size, n_heads, query_len, key_len].
        """
        batch_size = query.shape[0]

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        #Q = [batch_size, query_len, hid_dim]
        #K = [batch_size, key_len, hid_dim]
        #V = [batch_size, value_len, hid_dim]

        # Split hid_dim into (n_heads, head_dim) and move the head axis forward.
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        #Q = [batch_size, n_heads, query_len, head_dim]
        #K = [batch_size, n_heads, key_len, head_dim]
        #V = [batch_size, n_heads, value_len, head_dim]

        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        #energy = [batch_size, n_heads, query_len, key_len]

        if mask is not None:
            # A large negative score drives the softmax weight of masked
            # positions to zero.
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(energy, dim=-1)
        #attention = [batch_size, n_heads, query_len, key_len]

        x = torch.matmul(self.dropout(attention), V)
        #x = [batch_size, n_heads, query_len, head_dim]

        # Put the heads back next to head_dim and merge them into hid_dim;
        # .contiguous() is required before .view() after a permute.
        x = x.permute(0, 2, 1, 3).contiguous()
        #x = [batch_size, query_len, n_heads, head_dim]

        x = x.view(batch_size, -1, self.hid_dim)
        #x = [batch_size, query_len, hid_dim]

        x = self.fc_o(x)
        #x = [batch_size, query_len, hid_dim]

        return x, attention

In [15]:
class PositionwiseFeedforwardLayer(nn.Module):
    """Two-layer feed-forward block applied along the last dimension:
    hid_dim -> pf_dim (ReLU, dropout) -> hid_dim.

    Args:
        hid_dim: input and output feature size.
        pf_dim: size of the inner (expanded) layer.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape [batch_size, seq_len, hid_dim] to a tensor of the same shape."""
        # fc_1 has to be applied to x; passing the layer object itself to
        # relu raises a TypeError.
        x = self.dropout(torch.relu(self.fc_1(x)))
        #x = [batch_size, seq_len, pf_dim]
        x = self.fc_2(x)
        #x = [batch_size, seq_len, hid_dim]
        return x