In [1]:
# BERT model/tokenizer classes, PyTorch for building input tensors, and
# scikit-learn's pairwise cosine similarity for comparing embeddings.
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the tokenizer published with the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Count the entries in the tokenizer's vocabulary mapping and report the total.
num_vocab_entries = len(tokenizer.vocab)
print(f'The length of BERT base vocabulary: {num_vocab_entries}')


The length of BERT base vocabulary: 30522


In [3]:
# Encode a short sentence into a list of vocabulary ids. The printed list
# starts with 101 and ends with 102, which decode to [CLS] and [SEP].
text = 'A simple sentence'

tokens = tokenizer.encode(text)
print(tokens)

[101, 1037, 3722, 6251, 102]


In [4]:
# Map the ids back to text: the decoded string is lower-cased and includes the
# [CLS]/[SEP] markers that encode() placed around the sentence.
tokenizer.decode(tokens)

'[CLS] a simple sentence [SEP]'

In [5]:
# A longer, two-sentence example. Punctuation marks receive ids of their own
# (999 for '!' and 1012 for '.', as the per-token listing further down shows).
text = 'My friend told me about this class and I love it so far! She was right.'
tokens = tokenizer.encode(text)
print(tokens)

[101, 2026, 2767, 2409, 2033, 2055, 2023, 2465, 1998, 1045, 2293, 2009, 2061, 2521, 999, 2016, 2001, 2157, 1012, 102]


In [7]:
# Summarise the sentence, then decode every id on its own to reveal which
# piece of text each one stands for.
print(f'Text: {text} Num tokens: {len(tokens)}')
for token_id in tokens:
    subword = tokenizer.decode([token_id])
    print(f'Token: {token_id}, subword: {subword}')

Text: My friend told me about this class and I love it so far! She was right. Num tokens: 20
Token: 101, subword: [CLS]
Token: 2026, subword: my
Token: 2767, subword: friend
Token: 2409, subword: told
Token: 2033, subword: me
Token: 2055, subword: about
Token: 2023, subword: this
Token: 2465, subword: class
Token: 1998, subword: and
Token: 1045, subword: i
Token: 2293, subword: love
Token: 2009, subword: it
Token: 2061, subword: so
Token: 2521, subword: far
Token: 999, subword: !
Token: 2016, subword: she
Token: 2001, subword: was
Token: 2157, subword: right
Token: 1012, subword: .
Token: 102, subword: [SEP]


In [9]:
# 'Saket' has no vocabulary entry of its own, so the tokenizer splits it into
# pieces ('sake' + '##t'); list the piece behind every id.
text_with_unknown_words = 'Saket loves a beautiful day'
tokens_with_unknown_words = tokenizer.encode(text_with_unknown_words)
for token_id in tokens_with_unknown_words:
    piece = tokenizer.decode([token_id])
    print(f'Token: {token_id}, subword: {piece}')

Token: 101, subword: [CLS]
Token: 8739, subword: sake
Token: 2102, subword: ##t
Token: 7459, subword: loves
Token: 1037, subword: a
Token: 3376, subword: beautiful
Token: 2154, subword: day
Token: 102, subword: [SEP]


In [10]:
# A second out-of-vocabulary example: 'awesomesauce' is broken up into
# 'awesome' + '##sau' + '##ce', with '##' marking a continuation piece.
text_with_unknown_words = 'Saket is our instructor for this awesomesauce class'
tokens_with_unknown_words = tokenizer.encode(text_with_unknown_words)
for token_id in tokens_with_unknown_words:
    piece = tokenizer.decode([token_id])
    print(f'Token: {token_id}, subword: {piece}')

Token: 101, subword: [CLS]
Token: 8739, subword: sake
Token: 2102, subword: ##t
Token: 2003, subword: is
Token: 2256, subword: our
Token: 9450, subword: instructor
Token: 2005, subword: for
Token: 2023, subword: this
Token: 12476, subword: awesome
Token: 23823, subword: ##sau
Token: 3401, subword: ##ce
Token: 2465, subword: class
Token: 102, subword: [SEP]


In [11]:
# encode_plus returns a dict-like result instead of a bare id list: next to
# 'input_ids' it carries 'token_type_ids' and 'attention_mask'.
# NOTE: this rebinds `tokens`, which held a plain list of ids in earlier cells.
text = 'My friend told me about this class and I love it so far! She was right.'
tokens = tokenizer.encode_plus(text)
print(tokens)

# attention_mask is 1 for positions that take part in the attention calculation, otherwise 0

{'input_ids': [101, 2026, 2767, 2409, 2033, 2055, 2023, 2465, 1998, 1045, 2293, 2009, 2061, 2521, 999, 2016, 2001, 2157, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [14]:
# Load the pretrained BERT model from the same 'bert-base-uncased' checkpoint
# that the tokenizer was loaded from.
model = BertModel.from_pretrained('bert-base-uncased')

In [15]:
# Encode two sentences that use 'python' in different senses (the animal and
# the programming language) so their contextual embeddings can be compared.
python_pet, python_language = (
    tokenizer.encode(sentence)
    for sentence in ('I love my pet python', 'I love coding in python')
)

In [19]:
# Both sentences are assumed to tokenize to [CLS] + one id per word + [SEP],
# which places 'python' (the fifth word) at sequence position 5.
python_position = 5

# Inference only: torch.no_grad() stops autograd from recording the forward
# passes, so no computation graph is kept around and the outputs can be
# converted to NumPy directly.
with torch.no_grad():
    # contextful embedding of 'python' in 'I love my pet python'
    # unsqueeze(0) adds the batch dimension; [0] picks the first model output.
    pet_output = model(torch.tensor(python_pet).unsqueeze(0))[0]
    python_pet_embedding = pet_output[:, python_position, :].numpy()

    # contextful embedding of 'python' in 'I love coding in python'
    language_output = model(torch.tensor(python_language).unsqueeze(0))[0]
    python_language_embedding = language_output[:, python_position, :].numpy()

In [20]:
# A lone word is assumed to encode to [CLS] + one id + [SEP], so the word
# itself sits at sequence position 1.
word_position = 1

# Inference only: torch.no_grad() stops autograd from recording the forward
# passes, so no computation graph is kept around and the outputs can be
# converted to NumPy directly.
with torch.no_grad():
    # contextful embedding of 'snake' in 'snake'
    snake_output = model(torch.tensor(tokenizer.encode('snake')).unsqueeze(0))[0]
    snake_alone_embedding = snake_output[:, word_position, :].numpy()

    # contextful embedding of 'programming' in 'programming'
    programming_output = model(torch.tensor(tokenizer.encode('programming')).unsqueeze(0))[0]
    programming_alone_embedding = programming_output[:, word_position, :].numpy()

In [21]:
# Similarity of the representation of the word Python in a sentence about coding to the word snake.
# Each embedding is a single row, so the result is a 1x1 array.
cosine_similarity(python_language_embedding, snake_alone_embedding)

array([[0.5843477]], dtype=float32)

In [24]:
# Similarity of the representation of the word Python in a sentence about pets to the word snake.
# Each embedding is a single row, so the result is a 1x1 array.
cosine_similarity(python_pet_embedding, snake_alone_embedding)

array([[0.6928658]], dtype=float32)

In [26]:
# Similarity of the representation of the word Python in a sentence about pets to the word programming.
# Each embedding is a single row, so the result is a 1x1 array.
cosine_similarity(python_pet_embedding, programming_alone_embedding)

array([[0.498644]], dtype=float32)

In [27]:
# Similarity of the representation of the word Python in a sentence about coding to the word programming.
# Each embedding is a single row, so the result is a 1x1 array.
cosine_similarity(python_language_embedding, programming_alone_embedding)

array([[0.5614743]], dtype=float32)