This notebook explores adding pseudoword embeddings as new embeddings to a BERT model:

In [186]:
from transformers import BertForMaskedLM, BertTokenizer
import torch
import numpy as np

Load the pseudoword embeddings:

In [187]:
# Location of the saved pseudoword embedding matrix (NumPy .npy file).
pseudowords_path = "../out/pseudowords-test.npy"

# Read the matrix into memory and display it as the cell output.
pseudowords = np.load(pseudowords_path)
pseudowords

array([[ 1.4791993 ,  0.6717983 ,  0.34754717, ..., -0.83745944,
         3.1185284 ,  0.02498921],
       [-1.1472282 ,  0.6415113 , -2.1778345 , ...,  1.7338743 ,
        -1.3341838 ,  2.2364955 ],
       [ 1.2592171 ,  1.4044372 ,  0.7233554 , ..., -0.6436405 ,
         2.5080867 ,  0.63867843],
       ...,
       [ 0.7099659 ,  1.7526314 , -0.54122496, ...,  1.9021684 ,
        -3.6823387 ,  0.61035985],
       [ 0.5576828 , -1.8875732 ,  0.34652272, ...,  7.455317  ,
         0.824571  ,  3.4515877 ],
       [ 0.60624564,  2.9930663 , -0.77594584, ..., -1.3732387 ,
        -2.6446538 ,  2.951031  ]], dtype=float32)

Build the new token names (each target word followed by a running count of its occurrences):

In [188]:
import json

# Read the query records; the file is parsed as JSON despite its .txt extension.
with open("../libs/pwibm/data/queries/single_target/MaPP_all.txt") as json_file:
    data = json.load(json_file)

# The target word of each record is the whitespace-separated word of
# record["query"] found at position record["query_idx"].
new_tokens = [record["query"].split()[record["query_idx"]] for record in data]

# Give every occurrence a unique name by appending a 1-based running count
# per word ("in1", "in2", ...), keeping the order of the records.
token_counts = {}
bert_tokens = []
for word in new_tokens:
    occurrence = token_counts.get(word, 0) + 1
    token_counts[word] = occurrence
    bert_tokens.append(f"{word}{occurrence}")

bert_tokens

['in1',
 'in2',
 'in3',
 'in4',
 'in5',
 'in6',
 'in7',
 'in8',
 'in9',
 'in10',
 'for1',
 'for2',
 'for3',
 'for4',
 'for5',
 'for6',
 'for7',
 'for8',
 'for9',
 'for10',
 'for11',
 'for12',
 'for13',
 'for14',
 'for15',
 'started1',
 'started2',
 'started3',
 'started4',
 'started5',
 'started6',
 'started7',
 'started8',
 'started9',
 'started10',
 'had1',
 'had2',
 'had3',
 'had4',
 'had5',
 'had6',
 'had7',
 'had8',
 'had9',
 'had10',
 'had11',
 'had12',
 'had13',
 'had14',
 'had15',
 'had16',
 'had17',
 'had18',
 'had19',
 'had20',
 'had21',
 'had22',
 'had23',
 'had24',
 'had25',
 'about1',
 'about2',
 'about3',
 'about4',
 'about5',
 'about6',
 'about7',
 'about8',
 'with1',
 'with2',
 'with3',
 'with4',
 'with5',
 'with6',
 'with7',
 'with8',
 'with9',
 'with10',
 'with11',
 'on1',
 'on2',
 'on3',
 'on4',
 'on5',
 'on6',
 'on7',
 'on8',
 'on9',
 'on10',
 'run1',
 'run2',
 'run3',
 'run4',
 'run5',
 'run6',
 'run7',
 'run8']

Load the vanilla BERT model:

In [189]:
# The tokenizer and the masked-LM model are loaded from the same checkpoint.
checkpoint = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForMaskedLM.from_pretrained(checkpoint, return_dict=True)

# Display the word-embedding layer of the freshly loaded model.
model.bert.embeddings.word_embeddings

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(28996, 768, padding_idx=0)

Append the pseudoword embeddings to the model's existing word-embedding matrix:

In [190]:
# Append the pseudoword vectors below the original BERT word embeddings, so
# that row `tokenizer.vocab_size + i` of the new matrix holds pseudoword i.
old_embeddings = model.bert.embeddings.word_embeddings

# Keep only the base-vocabulary rows: on a fresh kernel this is the whole
# matrix, and on a re-run it prevents stacking a second copy of the
# pseudowords onto an already-extended matrix.
base_weights = old_embeddings.weight.detach()[:tokenizer.vocab_size]

# Match the dtype/device of the model weights so the concatenation cannot fail
# (or silently up-cast) if the saved array was stored differently.
pseudoword_weights = torch.from_numpy(pseudowords).to(
    dtype=base_weights.dtype, device=base_weights.device
)
assert pseudoword_weights.ndim == 2 and pseudoword_weights.shape[1] == base_weights.shape[1], (
    "pseudoword embeddings must be 2-D with the same width as the model's word embeddings"
)

combined_embeddings = torch.cat((base_weights, pseudoword_weights), dim=0)

# `from_pretrained` freezes the weights by default; that behaviour is kept and
# made explicit. `padding_idx` is carried over from the layer being replaced,
# otherwise the new layer would lose it.
model.bert.embeddings.word_embeddings = torch.nn.Embedding.from_pretrained(
    combined_embeddings,
    freeze=True,
    padding_idx=old_embeddings.padding_idx,
)
model.bert.embeddings.word_embeddings

Embedding(29093, 768)

Add the new token names to the tokenizer's vocabulary:

In [191]:
# Register the pseudoword names as new tokens in the tokenizer.
tokenizer.add_tokens(bert_tokens)

# `add_tokens` does not add names that are already in the vocabulary, which
# would shift the ids of all later names. Verify that name i received id
# `tokenizer.vocab_size + i` (the base vocabulary size plus its position), so
# ids line up with the embedding rows appended after the original vocabulary.
# The check is also valid when the cell is re-run.
first_new_id = tokenizer.vocab_size
expected_ids = list(range(first_new_id, first_new_id + len(bert_tokens)))
assert tokenizer.convert_tokens_to_ids(bert_tokens) == expected_ids, (
    "new token ids are not contiguous after the base vocabulary"
)

# Make the model's embedding size agree with the extended tokenizer.
model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 29093. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(29093, 768)

Try it with an example:

In [192]:
# Example sentence containing one of the added tokens and one masked position.
example_sentence = "[CLS] I started8 my [MASK]. [SEP]"
tokenized_text = tokenizer.tokenize(example_sentence)

# Position of the masked token within the tokenized sequence.
masked_index = tokenized_text.index("[MASK]")
tokenized_text

['[CLS]', 'I', 'started8', 'my', '[MASK]', '.', '[SEP]']

Convert the tokens to indices:

In [193]:
# Map each token to its vocabulary id, then wrap the id list in an outer list
# so the resulting tensor has a leading batch dimension of size 1.
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
input_ids = torch.tensor([token_ids])
input_ids

tensor([[  101,   146, 29028,  1139,   103,   119,   102]])

Predict the token:

In [194]:
# Inference only: run the forward pass without recording gradients.
with torch.no_grad():
    outputs = model(input_ids)
predictions = outputs.logits

# Logits at the [MASK] position of the first (and only) sequence in the batch.
mask_scores = predictions[0, masked_index]

# The highest-scoring id is the model's prediction for the masked token.
predicted_token_id = int(mask_scores.argmax())
predicted_token = tokenizer.convert_ids_to_tokens(predicted_token_id)

predicted_token

'for3'

Predict the top 100 tokens:

In [195]:
top_k = 100

# A single topk call yields both the scores and their vocabulary ids, sorted
# from highest to lowest score.
top_scores = torch.topk(predictions[0, masked_index], top_k)
predicted_token_ids = top_scores.indices
# NOTE: despite the name, these are raw logits, not probabilities (no softmax
# has been applied to the model output).
predicted_token_probs = top_scores.values

# Convert the predicted token IDs back to tokens
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids.tolist())

# Print the top `top_k` predictions together with their logits
for token, prob in zip(predicted_tokens, predicted_token_probs):
    print(token, prob)

for3 tensor(174.5336)
with11 tensor(147.5389)
run3 tensor(143.0515)
in8 tensor(133.6161)
on10 tensor(115.6954)
for14 tensor(111.0461)
with5 tensor(110.1141)
for11 tensor(106.6162)
with9 tensor(106.5543)
for4 tensor(101.0922)
started10 tensor(94.2033)
with4 tensor(91.6169)
in7 tensor(84.1162)
run4 tensor(82.7684)
for2 tensor(80.3614)
for1 tensor(78.2343)
on5 tensor(77.7789)
on6 tensor(77.5200)
started7 tensor(75.1326)
with10 tensor(74.9434)
about7 tensor(71.8192)
had18 tensor(71.6250)
had14 tensor(71.1675)
with6 tensor(69.6956)
had8 tensor(69.6460)
with7 tensor(69.3532)
run2 tensor(69.2648)
with8 tensor(63.9455)
had12 tensor(63.8302)
for15 tensor(58.1539)
had20 tensor(53.7997)
had17 tensor(48.5922)
on8 tensor(47.8732)
about8 tensor(44.9220)
on7 tensor(43.3684)
run8 tensor(42.9613)
had11 tensor(42.6453)
in10 tensor(42.6311)
in9 tensor(41.5016)
started6 tensor(33.6866)
about5 tensor(33.0514)
had6 tensor(29.8570)
had15 tensor(27.9974)
had10 tensor(24.6933)
started8 tensor(23.1316)
in2 tens

Predict the most probable word that is not part of the new embeddings:

In [196]:
# Ids below `tokenizer.vocab_size` belong to the original BERT vocabulary; the
# added pseudoword tokens start at id `tokenizer.vocab_size`. The bound is
# therefore strict: an id equal to `tokenizer.vocab_size` is already the first
# added token and must be excluded.
base_vocab_size = tokenizer.vocab_size

# Restrict the [MASK] logits to the original vocabulary and take the best one.
# The slice starts at id 0, so the argmax position is the token id itself.
predicted_token_ids = torch.argmax(predictions[0, masked_index, :base_vocab_size])

# Convert the predicted token ID back to a token
predicted_token = tokenizer.convert_ids_to_tokens([predicted_token_ids.item()])[0]

print(predicted_token)

book


Predict the top 5 words that are not part of the new embeddings:

In [197]:
# Number of original-vocabulary candidates to report.
top_n = 5

# Raw logits (not probabilities) over the full vocabulary at the [MASK] position.
predicted_token_probs = predictions[0, masked_index]

# Ids below `tokenizer.vocab_size` belong to the original BERT vocabulary; the
# added pseudoword tokens start at id `tokenizer.vocab_size`, so slicing with
# this strict upper bound drops exactly the added tokens.
base_vocab_size = tokenizer.vocab_size

# One topk over the restricted logits replaces a full sort per loop iteration.
# The slice starts at id 0, so the returned positions are the token ids, and
# topk returns them ordered from highest to lowest score.
top_values, top_ids = torch.topk(
    predicted_token_probs[:base_vocab_size], min(top_n, base_vocab_size)
)

# List of (token, logit) pairs, best first.
top_5_predictions = list(
    zip(tokenizer.convert_ids_to_tokens(top_ids.tolist()), top_values.tolist())
)

# Print the top predictions and their logits
for token, prob in top_5_predictions:
    print(token, prob)

book 8.8527250289917
life 8.380229949951172
day 8.087793350219727
journal 8.047537803649902
story 7.998483657836914
