In [6]:
from transformers import BertForMaskedLM, BertTokenizerFast
import torch

### Tokeniser

We are required to:

* Add special tokens to the start and end of each sentence.
* Pad & truncate all sentences to a single constant length.
* Explicitly differentiate real tokens from padding tokens with the “attention mask”.

#### Special Tokens `[SEP]`

At the end of every sentence, we need to append the special `[SEP]` token.

This token is an artifact of two-sentence tasks, where BERT is given two separate sentences and asked to determine something (e.g., can the answer to the question in sentence A be found in sentence B?).

I am not certain yet why the token is still required when we have only single-sentence input, but it is!

#### `[CLS]`

For classification tasks, we must prepend the special [CLS] token to the beginning of every sentence.

This token has special significance. BERT consists of 12 Transformer layers. Each transformer takes in a list of token embeddings, and produces the same number of embeddings on the output (but with the feature values changed, of course!).

In [7]:
SRC_LANG = 'en'
TOK_DIR = '../../tokenisation'

DATA_DIR = f'{TOK_DIR}/data'
DATA_PATH = f'{DATA_DIR}/iwslt2017-en-zh.{SRC_LANG}'

SPM_DIR = f'{TOK_DIR}/sentencepiece_custom'
SPM_PATH = f'{SPM_DIR}/{SRC_LANG}.model'

MODEL_DIR = '../../models'
BERT_MODEL_PATH = f'{MODEL_DIR}/bert-embed/BPE-bert-{SRC_LANG}'

In [8]:
from bert_tokeniser import BPEBertTokeniser

tokeniser = BPEBertTokeniser(SRC_LANG, model_file=SPM_PATH)

In [9]:
# tokeniser = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [10]:
model = BertForMaskedLM.from_pretrained(BERT_MODEL_PATH, output_hidden_states=True)
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(16384, 256, padding_idx=0)
      (position_embeddings): Embedding(288, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_a

### Architecture Explained

#### Input 

As shown, it takes index mappings for the words and outputs an input layer of size 768. In this case, the tokeniser has a vocab size of 30522.

(pls help)
It also takes positional encodings of 300, meaning it can process sentences of length 300? 
 
It also takes token type embeddings of size 2 - giving 2 possible types of tokens (0 or 1). 0 means padding. In this case, our sentences are padded to a constant length of 300.

#### Output

It will output the token indices, which when fed to the tokeniser will return the tokens. Given that BERTTokenizer uses some form of WordPiece, the tokens with type 1 shld be joined with the previous ones, while tokens with type 0 are start tokens? May need to investigate.

In [11]:
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(16384, 256, padding_idx=0)
      (position_embeddings): Embedding(288, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_a

Extracting Embeddings

In [12]:
import datasets

dataset = datasets.load_dataset("iwslt2017", "iwslt2017-en-zh")

train, test = dataset['train'], dataset['test']

In [20]:
sent = train[0]['translation']['en']
sent

"Thank you so much, Chris. And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful."

In [14]:
t = [v['en'] for v in train['translation'][:10]]

from transformers import BatchEncoding

batch = tokeniser(t, padding='max_length', max_length=64, return_special_tokens_mask=True)
v = BatchEncoding(batch, tensor_type='pt')

In [15]:
v['input_ids'].shape

torch.Size([10, 288])

In [16]:

import json
print(tokeniser(sent, padding='max_length', max_length=64, return_special_tokens_mask=True))


# Input Ids: Token IDs
# Token Type IDs: to research, for now all 0
# Attn Mask: all 1s, padding 0

{'input_ids': [1, 652, 48, 124, 397, 16257, 2766, 16259, 76, 61, 16262, 16244, 2764, 6, 560, 4806, 25, 107, 9, 1887, 25, 536, 25, 80, 2192, 3489, 16300, 36, 16262, 16251, 2569, 6318, 16259, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0

In [17]:
tokeniser.pad_token

AttributeError: 'BPEBertTokeniser' object has no attribute 'pad_token'

In [18]:
tokens = tokeniser(sent)

In [19]:
tokens = tokeniser.tokenize(sent)
indices = tokeniser.convert_tokens_to_ids(tokens)
for indice, token in zip(indices, tokens):
    print(indice, token)

AttributeError: 'BPEBertTokeniser' object has no attribute 'tokenize'

In [None]:
indice_tensor = torch.tensor([indices])
segment_tensor = torch.zeros(indice_tensor.shape) + 1

In [None]:
tokeniser(sent, return_special_tokens_mask=True)

In [23]:
model.eval()

with torch.no_grad():
    tokens = tokeniser(sent)
    input_ids = torch.tensor(tokens['input_ids']).reshape((1, -1))
    token_type_ids = torch.tensor([tokens['token_type_ids']])
    attention_mask = torch.tensor([tokens['attention_mask']])

    print(input_ids.shape)

    output = model(input_ids, token_type_ids, attention_mask)
    hidden_states = output.hidden_states

    print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
    layer_i = 0

    print ("Number of batches:", len(hidden_states[layer_i]))
    batch_i = 0

    print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))
    token_i = 0

    print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))
 

torch.Size([1, 288])
Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 288
Number of hidden units: 256


In [None]:
token_embeddings = torch.stack(hidden_states, dim=0)

token_embeddings.size()

torch.Size([13, 1, 288, 256])

In [None]:
token_embeddings = torch.squeeze(token_embeddings, dim=1)

token_embeddings.size()

torch.Size([13, 288, 256])

In [None]:
token_embeddings = token_embeddings.permute(1,0,2)

token_embeddings.size()

torch.Size([288, 13, 256])

#### Append Last 4 Layers

In [None]:
# Stores the token vectors, with shape [22 x 3,072]
token_vecs_cat = []

# `token_embeddings` is a [22 x 12 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:
    
    # `token` is a [12 x 768] tensor

    # Concatenate the vectors (that is, append them together) from the last 
    # four layers.
    # Each layer vector is 768 values, so `cat_vec` is length 3,072.
    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    
    # Use `cat_vec` to represent `token`.
    token_vecs_cat.append(cat_vec)

print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))


### Sum Last 4 Layers

In [None]:
# Stores the token vectors, with shape [22 x 768]
token_vecs_sum = []

# `token_embeddings` is a [22 x 12 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:

    # `token` is a [12 x 768] tensor

    # Sum the vectors from the last four layers.
    sum_vec = torch.sum(token[-4:], dim=0)
    
    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 288 x 256


In [None]:
# `hidden_states` has shape [13 x 1 x 22 x 768]

# `token_vecs` is a tensor with shape [22 x 768]
token_vecs = hidden_states[-2][0]

# Calculate the average of all 22 token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)

In [None]:
print ("Our final sentence embedding vector of shape:", sentence_embedding.size())

In [None]:
tokenised_text = tokeniser.tokenize(sent)

for id, tok in enumerate(tokenised_text):
    print(id, tok)

In [None]:
print('First 5 vector values for each instance of "to".')
print('')
print("honor to have   ", str(token_vecs_sum[15][:5]))
print("opportunity to come  ", str(token_vecs_sum[19][:5]))
print("come to this   ", str(token_vecs_sum[21][:5]))

First 5 vector values for each instance of "to".

honor to have    tensor([ 0.3083, -1.3135, -0.8791,  2.0045,  0.7318])
opportunity to come   tensor([ 0.7557, -4.7355,  0.4427, -6.3347, -4.0329])
come to this    tensor([ 1.9014, -5.0590,  3.2812, -0.9040,  2.8231])


In [None]:
"""
Cosine Similarity

May wish to test with other examples of ambiguous words ('river bank', 'bank robber')
"""

from scipy.spatial.distance import cosine

# Calculate the cosine similarity between the word bank
# for to
same_to = 1 - cosine(token_vecs_sum[19], token_vecs_sum[21])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_to)

Vector similarity for  *similar*  meanings:  0.39


In [None]:
import torch.nn as nn
class Embedding(nn.Module):
    def __init__(self, tokeniser=None):
        super().__init__()
        if not tokeniser:
            self.tokeniser = BertTokenizerFast.from_pretrained('bert-base-uncased')
        else:
            self.tokeniser = tokeniser
        self.model = BertForMaskedLM.from_pretrained(
            '../../models/bert-embed/bert', 
            output_hidden_states=True
        )
        self.model.eval()
    def forward(self, sentence: str):
        input_ids = torch.tensor([tokens['input_ids']])
        token_type_ids = torch.tensor([tokens['token_type_ids']])
        attention_mask = torch.tensor([tokens['attention_mask']])
        with torch.no_grad():
            output = model(input_ids, attention_mask, token_type_ids)
            hidden_states = output.hidden_states
            token_embeddings = torch.stack(hidden_states, dim=0)
            token_embeddings = torch.squeeze(token_embeddings, dim=1)
            token_embeddings = token_embeddings.permute(1,0,2)

            """
            TOKEN SUM VEC
            """
            token_vecs_sum = []
            # `token_embeddings` is a [len x 12 x 768] tensor.
            # For each token in the sentence...
            for token in token_embeddings:
                # `token` is a [12 x 768] tensor
                # Sum the vectors from the last four layers.
                sum_vec = torch.sum(token[-4:], dim=0)
                # Use `sum_vec` to represent `token`.
                token_vecs_sum.append(sum_vec)
            token_embedding_output = torch.vstack(token_vecs_sum)
            
            return {
                'Token': token_embedding_output,
            }


In [None]:
custom_bert = Embedding()

output = custom_bert.forward(train[0]['translation']['en'])

output['Token'].shape

### Wrapper over BaseBPETokeniser to output masks similar to BertTokenizerFast

In [None]:
class BPEforBERTTokenizer(object):
    def __init__(self, en_model_file=None, zh_model_file=None):
        self.bpe_tokenizer = BaseBPETokeniser(en_model_file=en_model_file, zh_model_file=zh_model_file)

    def __len__(self):
        """
        Both the english and chinese tokenisers have the same length.
        """
        return len(self.bpe_tokenizer)

    def __call__(self, sent: str, text_target=None, max_len=128, max_zh_len=None):
        out, len_out = self.bpe_tokenizer(sent, text_target=text_target, max_len=max_len, max_zh_len=max_zh_len) #len_out includes SOS and EOS
        out['token_type_ids'] = [0] * max_len
        out['special_tokens_mask'] = [1] + [0] * (len_out - 2) + [1] * (max_len - len_out + 1)
        out['attention_mask'] = [1] * len_out + [0] * (max_len - len_out)
        return out

In [None]:
tokeniser_bert = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokeniser_bert(dataset['train']['translation'][0]['en']).keys()

In [None]:
from sentencepiece_custom.tokeniser import BaseBPETokeniser

tokeniser = BaseBPETokeniser(zh_model_file="./sentencepiece_custom/zh.model", en_model_file="./sentencepiece_custom/en.model")
(tokeniser(dataset['train']['translation'][0]['en'], max_len=288)[0]['input_ids'])

In [None]:
print(tokeniser.get_special_ids('en'))
print(tokeniser.get_special_ids('zh'))

# returns (UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX)

In [None]:
tokeniser2 = BPEforBERTTokenizer(zh_model_file="./sentencepiece_custom/zh.model", en_model_file="./sentencepiece_custom/en.model")
test = tokeniser2(dataset['train']['translation'][0]['en'], max_len=288)
import numpy as np
for key in test.keys():
  curr = np.array(test[key])
  print(key)
  print(len(curr))
  print(curr)

  print(len(curr[curr == 3]))
  print(len(curr[curr == 1]))

In [None]:
print(len(tokeniser2))

In [None]:
print(dataset['train']['translation'][0])