In [1]:
%pip install transformers


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Hugging Face model id of the sentence-embedding checkpoint used throughout the notebook.
MODEL_NAME = "sentence-transformers/bert-base-nli-mean-tokens"

# sentences[0] is the reference text (a savings-bank advertisement); the remaining entries
# are the short texts that get compared against it further down.
# The "6 1/2" near the end used to be written "6 1\/2": "\/" is not a valid Python escape,
# so it left a stray backslash in the text and triggers an invalid-escape warning.
sentences = [
    "the fsb. a national savings bank. established march, 1865. chartered by the government of the united states. banking house 1507 pennsylvania avenue, opposite the treasury. deposits of five cents or any larger amounts received. six per cent interest paid on sums of five dollars or more. all deposits payable on demand, with interest due. all accounts strictly private and confidential. principal office, washington, d. c. branch offices in all the larger cities of the south and southwest. this great national savings institution, established by the authority of the united states government for the benefit of the freedmen, knows no distinction of race or color, and offers its great advantages to all classes alike. save the small sums. cut off your vices - don't smoke - don't drink - don't buy lottery tickets. put the money you save into the fsb. open from 9 a. m. to  4 p. m. each day, and on wednesday and saturday nights, to receive deposits only, from 6 1/2 to 8 o'clock.",
    "avoid buying costly garments",
    "spend not your hard-earned wages for filthy tobacco and useless drinks",
    "do not waste money at circuses, expensive pic-nics and excursions.",
    # NOTE(review): "tickets.it" below looks like two sentences run together (a missing
    # space, or a missing comma between two list items) — confirm the intended text.
    "cut off your vices - don't smoke - don't drink - don't buy lottery tickets.it is your duty to provide for your settlement in life",
]


In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

In [4]:
# Load the pretrained encoder and its matching tokenizer, both identified by MODEL_NAME.
# The two loads are independent of each other.
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [5]:
# Accumulator for the per-sentence encodings: one list of token-id rows and one list of
# attention-mask rows, filled in by the tokenization loop.
tokens = dict(input_ids=[], attention_mask=[])

In [6]:
# Encode every sentence to a fixed window of 128 tokens (longer texts are truncated,
# shorter ones padded) and collect the 1-D id / mask rows.
# The rows are gathered in fresh local lists and then assigned, rather than appended to
# the shared `tokens` lists, so re-running this cell does not duplicate rows and does not
# fail once `tokens[...]` has been turned into tensors.
input_ids, attention_masks = [], []
for sentence in sentences:
    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    new_tokens = tokenizer(sentence, max_length=128, truncation=True, padding='max_length', return_tensors='pt')
    # return_tensors='pt' yields a batch of one; [0] takes the single row.
    input_ids.append(new_tokens['input_ids'][0])
    attention_masks.append(new_tokens['attention_mask'][0])

tokens['input_ids'] = input_ids
tokens['attention_mask'] = attention_masks

In [7]:
# Turn each list of per-sentence rows into one batched tensor (one row per sentence).
for key in ('input_ids', 'attention_mask'):
    tokens[key] = torch.stack(tokens[key])

In [8]:
# Inference only: run the forward pass without recording an autograd graph, which avoids
# keeping gradient bookkeeping for every activation.
with torch.no_grad():
    outputs = model(**tokens)

In [9]:
# Token-level hidden states taken from the model output; these are pooled into one vector
# per sentence in the cells below.
# Presumably shaped (n_sentences, 128, hidden_dim) given the padded inputs — TODO confirm.
embeddings = outputs.last_hidden_state

In [10]:
# Broadcast the attention mask over the hidden dimension so it lines up element-wise with
# `embeddings`, then multiply so that masked-out positions contribute zeros.
attention = tokens['attention_mask']
mask = attention[..., None].expand_as(embeddings).float()
mask_embeddings = mask * embeddings

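**TODO:** work out exactly what the mean-pooling steps below mean and do. In brief: the masked token embeddings are summed along the token axis, and that sum is divided by the number of tokens the attention mask keeps, giving one averaged vector per sentence.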

In [11]:
# Add up the masked token vectors along dim 1, leaving one summed vector per sentence.
summed = mask_embeddings.sum(dim=1)

In [12]:
# Sum the mask along dim 1 to get the divisor for the mean; the floor of 1e-9 guards
# against dividing by zero.
counts = mask.sum(dim=1).clamp(min=1e-9)

In [13]:
# Mean pooling: the summed vectors divided element-wise by the mask counts.
mean_pooled = torch.div(summed, counts)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
# Convert the pooled sentence vectors to a NumPy array for scikit-learn.
# The array gets its own name instead of overwriting `mean_pooled`: rebinding the tensor
# to an ndarray made this cell fail on a second run (ndarray has no `.detach()`).
mean_pooled_np = mean_pooled.detach().cpu().numpy()

# Cosine similarity of the first text (kept 2-D via [:1]) against each remaining text;
# the result has one row and one column per compared text.
cosine_similarity(mean_pooled_np[:1], mean_pooled_np[1:])

array([[0.07808128, 0.24313726, 0.24965587, 0.36674416]], dtype=float32)