In [3]:
import torch
import re 
from random import * 
import numpy as np 
import torch 
import torch.nn as nn 
import torch.optim as optim

This code originated from here: https://towardsdatascience.com/deep-dive-into-the-code-of-bert-model-9f618472353e

In [None]:

class BertTokenizer(): 
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self, 
        vocab_file, 
        do_lower_case = True, 
        do_basic_tokenize= True,
        never_split=None,
        unk_token="[UNK]", 
        sep_token="[SEP]", 
        pad_token="[PAD]", 
        cls_token="[CLS]",
        mask_token="[MASK]", 
        tokenize_chinese_chars=True, 
        strip_accents=None,
        **kwargs
    ): 
        super().__init__(
            do_lower_case=do_lower_case, 
            do_basic_tokenize=do_basic_tokenize, 
            never_split=never_split, 
            unk_token=unk_token,
            sep_token=sep_token, 
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        if not os.path.isfile(vocab_file): 
            raise ValueError(
                "cant find file"
            )

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

        output_tokens = []
        for token in whitespace_tokenize(text): 
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word: 
                output_tokens.append(self.unk_token)
                continue 
        
            is_bad = False
            start = 0 
            sub_tokens = []
            while start < len(chars): 
                end = len(chars)
                cur_substr = None 
                while start < end: 
                    substr = "".join(chars[start:end])
                    if start > 0: 
                        substr = "##" + substr
                    if substr in self.vocab: 
                        cur_substr = substr
                        break
                    end -= 1 

                if cur_substr is None: 
                    is_bad = True 
                    break 

                sub_tokens.append(cur_substr)
                start = end 

            if is_bad: 
                output_tokens.append(self.unk_token)
            else: 
                output_tokens.extend(sub_tokens)
        return output_tokens 

In [11]:
from transformers import BertTokenizer

max_length = 256 
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")


In [36]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
inputs_tests = tokenizer(newsgroups_train['data'][:3], truncation=True, padding=True, max_length=max_length)

In [43]:
tokenizer.tokenize("hello world yo. my name is jy jeff")

['hello',
 'world',
 'yo',
 '.',
 'my',
 'name',
 'is',
 'j',
 '##y',
 'j',
 '##ef',
 '##f']

In [40]:
inputs_tests['attention_mask'][0]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [None]:
from sklearn.datasets import fetch_20newsgroups 
newsgroups_train = fetch_20newsgroups(subset='train')

max_length = 256

inputs_tests = tokenizer(newsgroups_train['data'][:3], truncation=True, padding=True, max_length=max_length, return_tensors='pt')



In [4]:
text = (
        'Hello, how are you? I am Romeo.\n'
        'Hello, Romeo My name is Juliet. Nice to meet you.\n'
        'Nice meet you too. How are you today?\n'
        'Great. My baseball team won the competition.\n'
        'Oh Congratulations, Juliet\n'
        'Thanks you Romeo'
    )

In [7]:
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n')
word_list = list(set(" ".join(sentences).split()))
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}

for i, w in enumerate(word_list): 
    word_dict[w] = i + 4
number_dict = {i: w for i, w in enumerate(word_dict)}
vocab_size = len(word_dict)

token_list = list()
for sentence in sentences: 
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)

In [8]:
token_list

[[19, 28, 10, 22, 14, 9, 26],
 [19, 26, 25, 21, 6, 15, 23, 18, 8, 22],
 [23, 8, 22, 13, 28, 10, 22, 16],
 [7, 25, 4, 12, 11, 20, 5],
 [24, 27, 15],
 [17, 22, 26]]

In [9]:
maxlen = 30 
batch_size = 6 
max_pred = 5 
n_layers = 6 
n_heads = 12
d_model = 768 
d_ff = 768 
d_k = d_v = 64 
n_segments = 2

In [None]:
def make_batch(): 
    batch = []
    positive = negative = 0 
    while positive != batch_size/2 or negative != batch_size/2: 
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        

In [None]:
class BERT(nn.Module): 
    def __init__(self): 
        super(BERT, self).__init__()
        self.embedding = Embedding() 
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ1 = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu 
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        embed_weight = self.embedding.tok_embed.weight 
        n_vocab, n_dim = embed_weight.size() 
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))
    
    def forward(self, input_ids, segment_ids, masked_pos): 
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        for layer in self.layers: 
            output, enc_self_attn = layer(output, enc_self_attn_mask)

        h_pooled = self.activ1(self.fc(output[:, 0]))
        logits_clsf = self.classifier(h_pooled)

        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1))
        h_masked = torch.gather(output, 1, masked_pos)
        h_masked = self.norm(self.activ2(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias

        return logits_lm, logits_clsf 
