In [2]:
from sentence_transformers import SentenceTransformer

# Instantiate the all-MiniLM-L6-v2 sentence-embedding checkpoint
# (the progress bars below show its files being fetched).
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm
.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 116kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<?, ?B/s] 
README.md: 100%|██████████| 10.6k/10.6k [00:00<?, ?B/s]
config.json: 100%|██████████| 612/612 [00:00<?, ?B/s] 
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 7.47kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<?, ?B/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:06<00:00, 15.0MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<?, ?B/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 7.24kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 744kB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<?, ?B/s] 
train_script.py: 100%|██████████| 13.2k/13.2k [00:00<?, ?B/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.08MB/s]
modules.json: 100%|██████████| 349/349 [00:00<?, ?B/s] 


In [1]:
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


---

### Usage (Sentence-Transformers)

In [8]:
from sentence_transformers import SentenceTransformer

# Use the MPNet checkpoint for this section; the MiniLM variant is kept,
# commented out, for quick switching.
# model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer(model_name_or_path='sentence-transformers/all-mpnet-base-v2')

In [9]:
# Encode a small batch, then show the result's shape followed by its values.
sentences = ["This is an example sentence", "Each sentence is converted"]
embeddings = model.encode(sentences)
print(embeddings.shape, embeddings, sep="\n")

(2, 768)
[[ 0.02250257 -0.07829178 -0.02303074 ... -0.00827928  0.02652692
  -0.00201897]
 [ 0.04170237  0.00109738 -0.01553418 ... -0.02181629 -0.06359358
  -0.00875285]]


---

### Usage (HuggingFace Transformers)

In [11]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked-out tokens.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings (shaped (batch, tokens, hidden) in this notebook).
        attention_mask: Tensor with one entry per token; non-zero entries mark
            tokens that take part in the average.

    Returns:
        The mask-weighted sum over dimension 1 divided by the per-sequence mask total.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask over the hidden dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = torch.sum(token_embeddings * mask, 1)
    # Clamp the divisor so a fully masked row divides by 1e-9 instead of zero.
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_counts

In [12]:
# Example inputs: one embedding is wanted per string.
sentences = [
    'This is an example sentence',
    'Each sentence is converted',
]

In [13]:
# Fetch the tokenizer and the bare transformer from the same HuggingFace Hub checkpoint.
hub_checkpoint = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(hub_checkpoint)
model = AutoModel.from_pretrained(hub_checkpoint)

tokenizer_config.json: 100%|██████████| 363/363 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 510kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 812kB/s]
special_tokens_map.json: 100%|██████████| 239/239 [00:00<?, ?B/s] 
config.json: 100%|██████████| 571/571 [00:00<00:00, 66.2kB/s]
pytorch_model.bin: 100%|██████████| 438M/438M [00:31<00:00, 14.1MB/s] 


In [47]:
# Inspect the string tokens the tokenizer produces, special tokens included.
# NOTE(review): `sentences` is a two-element list, and the recorded output is a single
# flat token list with '</s>', '</s>' between the two sentences -- i.e. the list is
# not tokenized element by element. Tokenize each string separately if per-sentence
# tokens are wanted -- TODO confirm intent.
# print(tokenizer.tokenize(sentences, add_special_tokens=True, padding=False))
print(tokenizer.tokenize(sentences, add_special_tokens=True))

['<s>', 'this', 'is', 'an', 'example', 'sentence', '</s>', '</s>', 'each', 'sentence', 'is', 'converted', '</s>']


In [14]:
# Batch-encode the sentences as PyTorch tensors, with padding and truncation enabled
# (the printed encoding shows the shorter sentence padded to the longer one).
encoded_input = tokenizer(
    sentences,
    return_tensors='pt',
    truncation=True,
    padding=True,
)

In [37]:
# Look at the container type, the full encoding, and then each field on its own.
print(type(encoded_input))
print(encoded_input)
for field_name in ("input_ids", "attention_mask"):
    print(encoded_input[field_name])
# Second dimension of input_ids = padded sequence length shared by the batch.
print("no. of tokens in a sentence = ", encoded_input["input_ids"].shape[1])

<class 'transformers.tokenization_utils_base.BatchEncoding'>
{'input_ids': tensor([[   0, 2027, 2007, 2023, 2746, 6255,    2],
        [   0, 2173, 6255, 2007, 4995,    2,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0]])}
tensor([[   0, 2027, 2007, 2023, 2746, 6255,    2],
        [   0, 2173, 6255, 2007, 4995,    2,    1]])
tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0]])
no. of tokens in a sentence =  7


In [20]:
# Run the transformer without autograd tracking (inference only). The two keyword
# arguments are exactly the fields the tokenizer returned in `encoded_input`.
with torch.no_grad():
    model_output = model(
        input_ids=encoded_input['input_ids'],
        attention_mask=encoded_input['attention_mask'],
    )

In [50]:
# Show the output container, then confirm that positional and attribute access
# agree for each of its two fields.
print(type(model_output))
print(len(model_output))
for position, field_name in enumerate(("last_hidden_state", "pooler_output")):
    print(field_name, "==>", model_output[position].shape, getattr(model_output, field_name).shape)
print(model_output.pooler_output)

<class 'transformers.modeling_outputs.BaseModelOutputWithPooling'>
2
last_hidden_state ==> torch.Size([2, 7, 768]) torch.Size([2, 7, 768])
pooler_output ==> torch.Size([2, 768]) torch.Size([2, 768])
tensor([[ 0.0880, -0.0418,  0.0182,  ...,  0.0867, -0.0284, -0.0387],
        [-0.0302, -0.0482, -0.0346,  ...,  0.0619, -0.0250,  0.0269]])


In [48]:
# Collapse the per-token vectors into one vector per sentence; the attention
# mask keeps padded positions out of the average.
sentence_embeddings = mean_pooling(
    model_output=model_output,
    attention_mask=encoded_input['attention_mask'],
)

In [51]:
# Show the pooled vectors before and after L2-normalizing along dim 1.
print(sentence_embeddings)
sentence_embeddings = F.normalize(sentence_embeddings, dim=1, p=2)
print(sentence_embeddings)

tensor([[ 0.0616, -0.2143, -0.0630,  ..., -0.0227,  0.0726, -0.0055],
        [ 0.1287,  0.0034, -0.0479,  ..., -0.0673, -0.1962, -0.0270]])
tensor([[ 0.0225, -0.0783, -0.0230,  ..., -0.0083,  0.0265, -0.0020],
        [ 0.0417,  0.0011, -0.0155,  ..., -0.0218, -0.0636, -0.0088]])


---

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

---

In [52]:
from transformers import BertModel, BertTokenizer
import torch

In [54]:
# Load the pretrained bert-base-uncased encoder.
model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')

config.json: 100%|██████████| 570/570 [00:00<00:00, 211kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
model.safetensors: 100%|██████████| 440M/440M [00:53<00:00, 8.31MB/s] 


In [55]:
# Example sentence walked through the manual BERT tokenization steps below.
sentence = "She is a MachineLearning Engineer and works in California"

In [56]:
# Load the tokenizer that matches the bert-base-uncased checkpoint.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')

tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 3.73kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 530kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 670kB/s]


In [60]:
# Split the sentence into tokens (no special tokens are added at this step,
# as the printed list shows).
tokens = tokenizer.tokenize(text=sentence)

In [61]:
# Show the raw tokens; pieces prefixed with '##' continue the preceding token.
print(tokens)

['she', 'is', 'a', 'machine', '##lea', '##rn', '##ing', 'engineer', 'and', 'works', 'in', 'california']


In [62]:
# Wrap the tokens in BERT's [CLS] ... [SEP] markers. The list is rebuilt from
# `sentence` rather than by extending the current `tokens`, so re-running this
# cell cannot stack a second pair of markers.
tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']

In [63]:
# Show the tokens with the [CLS] / [SEP] markers in place.
print(tokens)

['[CLS]', 'she', 'is', 'a', 'machine', '##lea', '##rn', '##ing', 'engineer', 'and', 'works', 'in', 'california', '[SEP]']


In [64]:
# Pad up to a fixed length instead of appending unconditionally, so re-running
# this cell does not keep growing the sequence. With the 14 tokens above this
# adds exactly two [PAD] entries. A sequence already at or beyond MAX_LENGTH
# is left unchanged (a non-positive repeat count yields no padding).
MAX_LENGTH = 16
tokens = tokens + ['[PAD]'] * (MAX_LENGTH - len(tokens))

In [67]:
# Show the padded token list and its length, one per line.
print(tokens, len(tokens), sep="\n")

['[CLS]', 'she', 'is', 'a', 'machine', '##lea', '##rn', '##ing', 'engineer', 'and', 'works', 'in', 'california', '[SEP]', '[PAD]', '[PAD]']
16


In [66]:
# 1 for every real token, 0 for every '[PAD]' entry.
attention_mask = [int(token != '[PAD]') for token in tokens]
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]


In [69]:
# Map every token string to its id, then show ids and tokens on consecutive
# lines for comparison.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids, tokens, sep="\n")

[101, 2016, 2003, 1037, 3698, 19738, 6826, 2075, 3992, 1998, 2573, 1999, 2662, 102, 0, 0]
['[CLS]', 'she', 'is', 'a', 'machine', '##lea', '##rn', '##ing', 'engineer', 'and', 'works', 'in', 'california', '[SEP]', '[PAD]', '[PAD]']


In [70]:
# Convert to tensors with a leading batch axis of size 1.
# as_tensor + reshape(1, -1) yields the same (1, seq_len) tensors as
# tensor(...).unsqueeze(0) on the first run, but re-running the cell leaves
# them unchanged instead of adding another axis to the converted tensors.
token_ids = torch.as_tensor(token_ids).reshape(1, -1)
attention_mask = torch.as_tensor(attention_mask).reshape(1, -1)

In [75]:
# Getting the embedding
# Inference only, so run under torch.no_grad() like the earlier forward pass:
# no autograd graph is built and the returned tensors carry no grad_fn.
with torch.no_grad():
    output = model(token_ids, attention_mask=attention_mask)

# Inspect the result: container type, number of fields, and each field's shape
# via both positional and attribute access.
print(type(output))
print(len(output))
print("last_hidden_state", "==>", output[0].shape, output.last_hidden_state.shape)
print(output.last_hidden_state)
print("pooler_output", "==>", output[1].shape, output.pooler_output.shape)

<class 'transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions'>
2
last_hidden_state ==> torch.Size([1, 16, 768]) torch.Size([1, 16, 768])
tensor([[[-0.1925,  0.1684, -0.4252,  ..., -0.2599,  0.3736,  0.0529],
         [ 0.2417, -0.2748, -0.4909,  ...,  0.1372,  0.3408, -0.4655],
         [-0.0871,  0.0837,  0.2605,  ..., -0.4635, -0.0462,  0.2621],
         ...,
         [ 0.6711, -0.0076, -0.3847,  ..., -0.1289, -0.5171, -0.8002],
         [-0.2731,  0.1098, -0.5440,  ...,  0.0314,  0.4467, -0.3448],
         [-0.2387,  0.0119, -0.4760,  ...,  0.4656,  0.5837, -0.3774]]],
       grad_fn=<NativeLayerNormBackward0>)
pooler_output ==> torch.Size([1, 768]) torch.Size([1, 768])
