In [19]:
import os
import sys
import json
import time
import math
import random
import datetime
from pathlib import Path
import wget

import argparse
import dill as pickle
from tqdm import tqdm
import urllib

import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import grad
from torch import Tensor
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import random_split

import transformers
from transformers import BertModel
from transformers import BertConfig
from transformers import BertTokenizer

import datasets
import tokenizers

In [20]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()
        
    def forward(self, query, key, value, mask=None):
        batch_size, num_head, sequence_length, size_per_head = key.size()
        
        # matmul between query and key
        key =  key.view(batch_size, num_head, size_per_head, sequence_length)
        
        # scale
        attention_score = torch.matmul(query,key) / math.sqrt(size_per_head)
        
        # applying mask(opt) : 0s are where we apply masking
        if mask is not None:
            mask = mask.unsqueeze(1) # (batch_size, 1, sequence_length, sequence_length)
            attention_score = attention_score.masked_fill(mask==0,-1e9)
        
        # applying softmax
        attention_score = F.softmax(attention_score, dim=-1)
        
        # matmul between attention_score and value
        return torch.matmul(attention_score,value), attention_score

class MultiHeadAttention(nn.Module):
    def __init__(self, model_dim, key_dim, value_dim, num_head):
        super(MultiHeadAttention, self).__init__()
        self.model_dim = model_dim
        self.key_dim = key_dim
        self.value_dim = value_dim
        self.num_head = num_head
        
        self.Wq = nn.Linear(model_dim, key_dim)
        self.Wk = nn.Linear(model_dim, key_dim)
        self.Wv = nn.Linear(model_dim, value_dim)
        self.attention = ScaledDotProductAttention()
        self.Wo = nn.Linear(model_dim, model_dim)
        
    def forward(self, query, key, value, mask=None):
        # linearly project queries, key and values
        prj_query = self.Wq(query)
        prj_key = self.Wk(key)
        prj_value = self.Wv(value)
        
        # split prj_query, prj_key, prj_value into multi head
        multihead_query = self.multihead_split(prj_query)
        multihead_key = self.multihead_split(prj_key)
        multihead_value = self.multihead_split(prj_value)
        
        # perform Scaled Dot Product Attention
        attention_output, attention_score = self.attention(multihead_query, multihead_key, multihead_value, mask=mask)
        
        # concat output back to 3-dimensional tensor of (batch_size, sequence_length, hidden_size)
        output = self.multihead_concat(attention_output)
        output = self.Wo(output)
        
        return output
    
    def multihead_split(self, tensor):
        batch_size, sequence_length, hidden_size = tensor.size()
        
        size_per_head = hidden_size // self.num_head
        # (batch_size, num_head, sequence_length, size_per_head)
        return tensor.view(batch_size, self.num_head, sequence_length, size_per_head)
    
    def multihead_concat(self, tensor):
        batch_size, num_head, sequence_length, size_per_head = tensor.size()
        
        hidden_size = num_head * size_per_head
        return tensor.view(batch_size,sequence_length,hidden_size)

class FeedForward(nn.Module):
    def __init__(self, model_dim, hidden_dim, drop_prob):
        super(FeedForward,self).__init__()
        self.model_dim = model_dim
        self.hidden_dim = hidden_dim
        self.drop_prob = drop_prob
        
        self.linearlayer1 = nn.Linear(model_dim, hidden_dim)
        self.linearlayer2 = nn.Linear(hidden_dim, model_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drop_prob)
        
    def forward(self, tensor):
        tensor = self.dropout(self.relu(self.linearlayer1(x)))
        return self.linearlayer2(tensor)

In [21]:
class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, model_dim):
        super(TokenEmbedding, self).__init__(vocab_size, model_dim, padding_idx=1)

class PositionalEncoding(nn.Module):
    def __init__(self, model_dim, max_len, device):
        super(PositionalEncoding, self).__init__()
        
        self.encoding = torch.zeros(max_len, model_dim, device=device)
        self.encoding.requires_grad = False
        
        pos = torch.arange(0,max_len,device=device).float().unsqueeze(dim=1)
        _2i = torch.arange(0,model_dim,step=2,device=device).float()
        
        # self.encoding = (sequence_length, hidden_size)
        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i/model_dim)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i/model_dim)))
        
    def forward(self, tensor):
        batch_size, sequence_length = tensor.size()
        
        # (sequence_length, hidden_size)
        return self.encoding[:sequence_length, :]

class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, model_dim, max_len, drop_prob, device):
        super(TransformerEmbedding,self).__init__()
        self.tok_emb = TokenEmbedding(vocab_size, model_dim)
        self.pos_emb = PositionalEncoding(model_dim, max_len, device)
        self.drop_out = nn.Dropout(drop_prob)
    
    def forward(self, tensor):
        tok_emb = self.tok_emb(tensor)
        pos_emb = self.pos_emb(tensor)
        
        return self.drop_out(tok_emb+pos_emb)

In [22]:
class EncoderLayer(nn.Module):
    def __init__(self, model_dim, key_dim, value_dim, hidden_dim, num_head, drop_prob):
        super(EncoderLayer,self).__init__()
        
        self.attention = MultiHeadAttention(model_dim, key_dim, value_dim, num_head)
        self.normalization1 = nn.LayerNorm(model_dim)
        self.dropout1 = nn.Dropout(drop_prob)
        
        self.ffn = FeedForward(model_dim, hidden_dim, drop_prob)
        self.normalization2 = nn.LayerNorm(model_dim)
        self.dropout2 = nn.Dropout(drop_prob)
        
    def forward(self, tensor, source_mask):
        residual = tensor
        tensor = self.attention(query=tensor,key=tensor,value=tensor,mask=source_mask)
        tensor = self.dropout1(self.normalization1(tensor+residual))
        
        residual = tensor
        tensor = self.ffn(tensor)
        tensor = self.dropout2(self.normalization2(tensor+residual))
        
        return tensor

In [29]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, max_len, 
                    model_dim, key_dim, value_dim, hidden_dim, 
                    num_head, num_layer, drop_prob, device):
        super(Encoder,self).__init__()
        self.embedding = TransformerEmbedding(vocab_size, model_dim, max_len, drop_prob, device)
        
        self.layers = nn.ModuleList([EncoderLayer(model_dim, key_dim, value_dim, 
                                                  hidden_dim, num_head, 
                                                  drop_prob) for _ in range(num_layer)])
        
    def forward(self, input_ids, token_type_ids, attention_mask):
        input_emb = self.embedding(input_ids)
        encoder_output = input_emb
        
        for layer in self.layers:
            encoder_output = layer(encoder_output, attention_mask)
        
        return encoder_output

class NaturalLanguageUnderstandingHead(nn.Module):
    def __init__(self, vocab_size, model_dim):
        super(NaturalLanguageUnderstandingHead,self).__init__()
        self.linear_layer = nn.Linear(model_dim, vocab_size)
    
    def forward(self, encoder_output):
        return F.log_softmax(self.linear_layer(encoder_output),dim=-1)

class BERTModel(nn.Module):
    def __init__(self, pad_idx, mask_idx, cls_idx, sep_idx,
                vocab_size, model_dim, key_dim, value_dim, hidden_dim, 
                num_head, num_layer, max_len, drop_prob, device):
        super(BERTModel, self).__init__()
        self.pad_idx = pad_idx
        self.mask_idx = mask_idx
        self.cls_idx = cls_idx
        self.sep_idx = sep_idx
        self.device = device

        self.Encoder = Encoder(vocab_size, max_len, model_dim, key_dim, value_dim, hidden_dim, num_head, num_layer, drop_prob, device)
        self.NLUHead = NaturalLanguageUnderstandingHead(vocab_size, model_dim)

    def forward(self, input_ids, token_type_ids, attention_mask):
        encoder_output = self.Encoder(input_ids, token_type_ids, attention_mask)
        output = self.NLUHead(encoder_output)

        return self.NLUHead(encoder_output)
    
    # applying mask(opt) : 0s are where we apply masking
    def generate_padding_mask(self, query, key, query_pad_type=None, key_pad_type=None):
        # query = (batch_size, query_length)
        # key = (batch_size, key_length)
        query_length = query.size(1)
        key_length = key.size(1)
        
        # convert query and key into 4-dimensional tensor
        # query = (batch_size, 1, query_length, 1) -> (batch_size, 1, query_length, key_length)
        # key = (batch_size, 1, 1, key_length) -> (batch_size, 1, query_length, key_length)
        query = query.ne(query_pad_idx).unsqueeze(1).unsqueeze(3)
        query = query.repeat(1,1,1,key_length)
        key = key.ne(key_pad_idx).unsqueeze(1).unsqueeze(2)
        key = key.repeat(1,1,query_length,1)
        
        # create padding mask with key and query
        mask = key & query
        
        return mask

In [30]:
def build_model(pad_idx, mask_idx, cls_idx, sep_idx, unk_idx,
                vocab_size, model_dim, key_dim, value_dim, hidden_dim, 
                num_head, num_layer, max_len, drop_prob, device):
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = BERTModel(pad_idx, mask_idx, cls_idx, sep_idx,
                vocab_size, model_dim, key_dim, value_dim, hidden_dim, 
                num_head, num_layer, max_len, drop_prob, device)
    
    return model.cuda() if torch.cuda.is_available() else model

In [33]:
tmp_model = build_model(0,103,101,102,100,30000,
                       512,64,64,2048,8,12,1024,0.1,'cuda:0')

params = list(tmp_model.parameters())
print("The number of parameters:",sum([p.numel() for p in tmp_model.parameters() if p.requires_grad]), "elements")

The number of parameters: 60304944 elements
