In [3]:
from transformers import BertForMaskedLM, BertTokenizerFast
import torch

  from .autonotebook import tqdm as notebook_tqdm


### Tokeniser

We are required to:

* Add special tokens to the start and end of each sentence.
* Pad & truncate all sentences to a single constant length.
* Explicitly differentiate real tokens from padding tokens with the “attention mask”.

#### Special Tokens `[SEP]`

At the end of every sentence, we need to append the special `[SEP]` token.

This token is an artifact of two-sentence tasks, where BERT is given two separate sentences and asked to determine something (e.g., can the answer to the question in sentence A be found in sentence B?).

I am not certain yet why the token is still required when we have only single-sentence input, but it is!

#### `[CLS]`

For classification tasks, we must prepend the special [CLS] token to the beginning of every sentence.

This token has special significance. BERT consists of 12 Transformer layers. Each transformer takes in a list of token embeddings, and produces the same number of embeddings on the output (but with the feature values changed, of course!).

In [4]:
tokeniser = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [25]:
model = BertForMaskedLM.from_pretrained('../models/bert-embed/bert', output_hidden_states=True)

### Architecture Explained

#### Input 

As shown, it takes index mappings for the words and outputs an input layer of size 768. In this case, the tokeniser has a vocab size of 30522.

(pls help)
It also takes positional encodings of 300, meaning it can process sentences of length 300? 
 
It also takes token type embeddings of size 2 - giving 2 possible types of tokens (0 or 1). 0 means padding. In this case, our sentences are padded to a constant length of 300.

#### Output

It will output the token indices, which when fed to the tokeniser will return the tokens. Given that BERTTokenizer uses some form of WordPiece, the tokens with type 1 shld be joined with the previous ones, while tokens with type 0 are start tokens? May need to investigate.

In [7]:
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(300, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

Extracting Embeddings

In [8]:
import datasets

dataset = datasets.load_dataset("iwslt2017", "iwslt2017-en-zh")

train, test = dataset['train'], dataset['test']

In [9]:
sent = train[0]['translation']['en']
sent

"Thank you so much, Chris. And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful."

In [10]:
tokens = tokeniser.tokenize(sent)
indices = tokeniser.convert_tokens_to_ids(tokens)
for indice, token in zip(indices, tokens):
    print(indice, token)

4067 thank
2017 you
2061 so
2172 much
1010 ,
3782 chris
1012 .
1998 and
2009 it
1005 '
1055 s
5621 truly
1037 a
2307 great
3932 honor
2000 to
2031 have
1996 the
4495 opportunity
2000 to
2272 come
2000 to
2023 this
2754 stage
3807 twice
1025 ;
1045 i
1005 '
1049 m
5186 extremely
8794 grateful
1012 .


In [11]:
indice_tensor = torch.tensor([indices])
segment_tensor = torch.zeros(indice_tensor.shape) + 1

In [14]:
tokeniser(sent, return_special_tokens_mask=True)

{'input_ids': [101, 4067, 2017, 2061, 2172, 1010, 3782, 1012, 1998, 2009, 1005, 1055, 5621, 1037, 2307, 3932, 2000, 2031, 1996, 4495, 2000, 2272, 2000, 2023, 2754, 3807, 1025, 1045, 1005, 1049, 5186, 8794, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}

In [29]:
model.eval()

with torch.no_grad():
    tokens = tokeniser(sent)
    input_ids = torch.tensor([tokens['input_ids']])
    token_type_ids = torch.tensor([tokens['token_type_ids']])
    attention_mask = torch.tensor([tokens['attention_mask']])

    output = model(input_ids, token_type_ids, attention_mask)
    hidden_states = output.hidden_states

    print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
    layer_i = 0

    print ("Number of batches:", len(hidden_states[layer_i]))
    batch_i = 0

    print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))
    token_i = 0

    print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))
 

Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 34
Number of hidden units: 768


In [30]:
token_embeddings = torch.stack(hidden_states, dim=0)

token_embeddings.size()

torch.Size([13, 1, 34, 768])

In [31]:
token_embeddings = torch.squeeze(token_embeddings, dim=1)

token_embeddings.size()

torch.Size([13, 34, 768])

In [32]:
token_embeddings = token_embeddings.permute(1,0,2)

token_embeddings.size()

torch.Size([34, 13, 768])

#### Append Last 4 Layers

In [33]:
# Stores the token vectors, with shape [22 x 3,072]
token_vecs_cat = []

# `token_embeddings` is a [22 x 12 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:
    
    # `token` is a [12 x 768] tensor

    # Concatenate the vectors (that is, append them together) from the last 
    # four layers.
    # Each layer vector is 768 values, so `cat_vec` is length 3,072.
    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    
    # Use `cat_vec` to represent `token`.
    token_vecs_cat.append(cat_vec)

print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))


Shape is: 34 x 3072


### Sum Last 4 Layers

In [34]:
# Stores the token vectors, with shape [22 x 768]
token_vecs_sum = []

# `token_embeddings` is a [22 x 12 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:

    # `token` is a [12 x 768] tensor

    # Sum the vectors from the last four layers.
    sum_vec = torch.sum(token[-4:], dim=0)
    
    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 34 x 768


In [35]:
# `hidden_states` has shape [13 x 1 x 22 x 768]

# `token_vecs` is a tensor with shape [22 x 768]
token_vecs = hidden_states[-2][0]

# Calculate the average of all 22 token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)

In [36]:
print ("Our final sentence embedding vector of shape:", sentence_embedding.size())

Our final sentence embedding vector of shape: torch.Size([768])


In [38]:
tokenised_text = tokeniser.tokenize(sent)

for id, tok in enumerate(tokenised_text):
    print(id, tok)

0 thank
1 you
2 so
3 much
4 ,
5 chris
6 .
7 and
8 it
9 '
10 s
11 truly
12 a
13 great
14 honor
15 to
16 have
17 the
18 opportunity
19 to
20 come
21 to
22 this
23 stage
24 twice
25 ;
26 i
27 '
28 m
29 extremely
30 grateful
31 .


In [39]:
print('First 5 vector values for each instance of "to".')
print('')
print("honor to have   ", str(token_vecs_sum[15][:5]))
print("opportunity to come  ", str(token_vecs_sum[19][:5]))
print("come to this   ", str(token_vecs_sum[21][:5]))

First 5 vector values for each instance of "to".

honor to have    tensor([-6.6135, -0.3495,  5.6792, -4.1039, -1.8736])
opportunity to come   tensor([-0.7312, -4.5288,  4.5461,  0.7789, -8.9607])
come to this    tensor([-0.5958, -5.9333,  1.3312, -7.3594, -5.6697])


In [41]:
"""
Cosine Similarity

May wish to test with other examples of ambiguous words ('river bank', 'bank robber')
"""

from scipy.spatial.distance import cosine

# Calculate the cosine similarity between the word bank
# for to
same_to = 1 - cosine(token_vecs_sum[19], token_vecs_sum[21])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_to)

Vector similarity for  *similar*  meanings:  0.48


In [48]:
import torch.nn as nn
class Embedding(nn.Module):
    def __init__(self, tokeniser=None):
        super().__init__()
        if not tokeniser:
            self.tokeniser = BertTokenizerFast.from_pretrained('bert-base-uncased')
        else:
            self.tokeniser = tokeniser
        self.model = BertForMaskedLM.from_pretrained(
            '../models/bert-embed/bert', 
            output_hidden_states=True
        )
        self.model.eval()
    def forward(self, sentence: str):
        input_ids = torch.tensor([tokens['input_ids']])
        token_type_ids = torch.tensor([tokens['token_type_ids']])
        attention_mask = torch.tensor([tokens['attention_mask']])
        with torch.no_grad():
            output = model(input_ids, attention_mask, token_type_ids)
            hidden_states = output.hidden_states
            token_embeddings = torch.stack(hidden_states, dim=0)
            token_embeddings = torch.squeeze(token_embeddings, dim=1)
            token_embeddings = token_embeddings.permute(1,0,2)

            """
            SENTENCE Vec
            """
            token_vecs = hidden_states[-2][0]
            # Calculate the average of all token vectors.
            sentence_embedding_output = torch.mean(token_vecs, dim=0)

            """
            TOKEN SUM VEC
            """
            token_vecs_sum = []
            # `token_embeddings` is a [len x 12 x 768] tensor.
            # For each token in the sentence...
            for token in token_embeddings:
                # `token` is a [12 x 768] tensor
                # Sum the vectors from the last four layers.
                sum_vec = torch.sum(token[-4:], dim=0)
                # Use `sum_vec` to represent `token`.
                token_vecs_sum.append(sum_vec)
            token_embedding_output = torch.vstack(token_vecs_sum)
            
            return {
                'Token': token_embedding_output,
                'Sentence': sentence_embedding_output
            }


In [51]:
custom_bert = Embedding()

output = custom_bert.forward(train[0]['translation']['en'])

output['Token'].shape, output['Sentence'].shape

(torch.Size([34, 768]), torch.Size([768]))