In [50]:
import pandas as pd

In [114]:
# Ten example sentences used as the corpus for the embedding and
# cosine-similarity cells in this notebook.
# NOTE(review): the entry at index 7 ends with an unmatched closing quote (”);
# it is kept as-is so the recorded outputs stay reproducible.
sentences = [
    "Down, down, down. There was nothing else to do, so Alice soon began talking again.",
    "Presently she began again.",
    "Dinah my dear! I wish you were down here with me! There are no mice in the air, I’m afraid, but you might catch a bat, and that’s very like a mouse, you know.",
    "Suddenly she came upon a little three-legged table, all made of solid glass; there was nothing on it except a tiny golden key, and Alice’s first thought was that it might belong to one of the doors of the hall; but, alas! either the locks were too large, or the key was too small, but at any rate it would not open any of them.",
    "she knelt down and looked along the passage into the loveliest garden you ever saw.",
    "Alice opened the door and found that it led into a small passage, not much larger than a rat-hole.",
    "she could see it quite plainly through the glass, and she tried her best to climb up one of the legs of the table, but it was too slippery; and when she had tired herself out with trying, the poor little thing sat down and cried.",
    "Oh dear, what nonsense I’m talking!”",
    "After a time she heard a little pattering of feet in the distance, and she hastily dried her eyes to see what was coming.",
    "However, the Multiplication Table doesn’t signify: let’s try Geography.",
]

In [52]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [53]:
from sentence_transformers import SentenceTransformer

In [116]:
# Load the pre-trained MiniLM sentence-embedding model and encode every
# sentence in one call; the recorded output shows a (10, 384) result.
# The model is bound to `st_model` rather than `model` because a later cell
# binds `model` to a Hugging Face `AutoModel`; sharing one name for two
# different kinds of object makes the notebook break when cells are re-run
# out of order.
st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
sentence_embeddings = st_model.encode(sentences)

In [117]:
# Sanity check: expect one embedding row per input sentence.
sentence_embeddings.shape

(10, 384)

In [118]:
from sklearn.metrics.pairwise import cosine_similarity


In [120]:
# Cosine similarity between the first sentence's embedding (kept 2-D via
# slicing) and the embeddings of all remaining sentences.
cosine_similarity(X=sentence_embeddings[:1], Y=sentence_embeddings[1:])

array([[0.34808955, 0.24615395, 0.44296294, 0.25625923, 0.4576106 ,
        0.273158  , 0.1791234 , 0.35302007, 0.05745126]], dtype=float32)

In [121]:
from transformers import AutoTokenizer, AutoModel
import torch

In [122]:
# Load the tokenizer and the transformer model from the same checkpoint.
# The checkpoint id lives in one constant so the two objects cannot silently
# drift apart if the checkpoint is changed later.
BERT_CHECKPOINT = 'sentence-transformers/bert-base-nli-mean-tokens'

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [123]:
# Tokenize all sentences in a single batched call instead of looping with
# `encode_plus` and stacking the per-sentence tensors by hand.
# Every sentence is truncated/padded to exactly 128 tokens and returned as
# PyTorch tensors, so each resulting tensor has one row per sentence.
encoded = tokenizer(sentences, max_length=128,
                    truncation=True, padding='max_length',
                    return_tensors='pt')

# Keep `tokens` a plain dict holding only the two entries the following cells
# use (`input_ids` and `attention_mask`), exactly as before.
tokens = {key: encoded[key] for key in ('input_ids', 'attention_mask')}

In [124]:
# Run the forward pass for inference only. Disabling autograd avoids building
# a gradient graph that is never used (the pooled result is later detached
# and converted to NumPy), which saves memory and time.
with torch.no_grad():
    outputs = model(**tokens)

# Show which tensors the model returned.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [125]:
# Take the final-layer hidden states (one vector per token position) as the
# token-level embeddings, then display them.
embeddings = outputs['last_hidden_state']
embeddings

tensor([[[-0.5725, -0.2582,  1.2679,  ...,  1.0191, -0.6003,  0.3181],
         [-0.7022, -0.0770,  1.1637,  ...,  1.0347, -0.4110,  0.2389],
         [-0.7907,  0.2725,  1.9286,  ...,  0.7708, -0.4785,  0.2986],
         ...,
         [-0.0689, -0.3695,  0.8521,  ...,  0.9260, -0.4685,  0.3847],
         [-0.0912, -0.2562,  0.8676,  ...,  0.8459, -0.2884,  0.3101],
         [-0.1246, -0.3658,  0.7316,  ...,  0.8016, -0.3454,  0.2326]],

        [[ 0.2914, -1.0093,  2.5973,  ..., -0.3324, -0.4302, -0.0656],
         [ 0.2372, -0.7725,  2.7641,  ..., -0.5494, -0.2798, -0.2300],
         [ 0.4882, -1.0362,  2.5833,  ..., -0.3571, -0.4448, -0.1206],
         ...,
         [ 0.2622, -0.8296,  2.1079,  ..., -0.2499, -0.3384, -0.1022],
         [ 0.3420, -0.8151,  2.1221,  ..., -0.2512, -0.3642, -0.0419],
         [ 0.3693, -0.7528,  2.0748,  ..., -0.2194, -0.3325, -0.1034]],

        [[-0.7408,  0.8430,  0.1358,  ...,  1.2144, -0.0063,  1.0074],
         [-0.0702,  0.5346,  0.0220,  ...,  1

In [126]:
# Shape of the token-level embeddings; the recorded output is
# (10, 128, 768), i.e. sentences x padded token positions x hidden size.
embeddings.shape


torch.Size([10, 128, 768])

In [127]:
# Pull out the attention mask produced by the tokenizer and check its
# dimensions (one row per sentence, one column per token position).
attention_mask = tokens['attention_mask']
attention_mask.size()

torch.Size([10, 128])

In [128]:
# Add a trailing axis to the attention mask and expand it to the same shape
# as the token embeddings so it can be multiplied element-wise; cast to float
# to match the embeddings' dtype.
mask = attention_mask[..., None].expand_as(embeddings).float()
mask.size()

torch.Size([10, 128, 768])

In [129]:
# Inspect the expanded mask: in the recorded output, leading rows are all 1s
# and trailing rows are all 0s (presumably real tokens vs. padding positions).
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [130]:
# Zero out the embedding vectors wherever the mask is 0 so they do not
# contribute to the pooled sum.
masked_embeddings = torch.mul(embeddings, mask)
masked_embeddings.size()

torch.Size([10, 128, 768])

In [131]:
# Inspect the masked embeddings: rows where the mask is 0 are now all zeros.
masked_embeddings

tensor([[[-0.5725, -0.2582,  1.2679,  ...,  1.0191, -0.6003,  0.3181],
         [-0.7022, -0.0770,  1.1637,  ...,  1.0347, -0.4110,  0.2389],
         [-0.7907,  0.2725,  1.9286,  ...,  0.7708, -0.4785,  0.2986],
         ...,
         [-0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [-0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [-0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000]],

        [[ 0.2914, -1.0093,  2.5973,  ..., -0.3324, -0.4302, -0.0656],
         [ 0.2372, -0.7725,  2.7641,  ..., -0.5494, -0.2798, -0.2300],
         [ 0.4882, -1.0362,  2.5833,  ..., -0.3571, -0.4448, -0.1206],
         ...,
         [ 0.0000, -0.0000,  0.0000,  ..., -0.0000, -0.0000, -0.0000],
         [ 0.0000, -0.0000,  0.0000,  ..., -0.0000, -0.0000, -0.0000],
         [ 0.0000, -0.0000,  0.0000,  ..., -0.0000, -0.0000, -0.0000]],

        [[-0.7408,  0.8430,  0.1358,  ...,  1.2144, -0.0063,  1.0074],
         [-0.0702,  0.5346,  0.0220,  ...,  1

In [132]:
# Sum the masked token vectors along the token axis (dim 1), leaving one
# summed vector per sentence.
summed = masked_embeddings.sum(dim=1)
summed.size()

torch.Size([10, 768])

In [133]:
# Sum the mask along the token axis (dim 1) to get the divisor for the mean.
# The lower clamp of 1e-9 guards against division by zero for an all-zero mask.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.size()

torch.Size([10, 768])

In [134]:

# Inspect the divisors: each row holds a single per-sentence count repeated
# across the hidden dimension.
summed_mask

tensor([[22., 22., 22.,  ..., 22., 22., 22.],
        [ 7.,  7.,  7.,  ...,  7.,  7.,  7.],
        [47., 47., 47.,  ..., 47., 47., 47.],
        ...,
        [13., 13., 13.,  ..., 13., 13., 13.],
        [29., 29., 29.,  ..., 29., 29., 29.],
        [19., 19., 19.,  ..., 19., 19., 19.]])

In [135]:

# Mean pooling: divide each sentence's summed token vectors element-wise by
# its mask count to obtain one fixed-size embedding per sentence.
mean_pooled = torch.div(summed, summed_mask)

In [136]:
# Inspect the mean-pooled sentence embeddings.
mean_pooled

tensor([[-0.5726, -0.0466,  1.3504,  ...,  1.1020, -0.4344,  0.1422],
        [ 0.2931, -0.8939,  2.7762,  ..., -0.2616, -0.3088, -0.3072],
        [-0.4488,  0.8217,  0.4860,  ...,  1.1170, -0.2369,  0.6522],
        ...,
        [ 0.5556,  0.7498,  2.3417,  ..., -0.3289,  0.0677,  0.4777],
        [-0.1190, -0.0122,  1.1479,  ...,  0.6862, -1.1993,  0.5422],
        [ 0.0725,  1.2674,  0.5034,  ...,  0.3016, -0.3126,  0.2467]],
       grad_fn=<DivBackward0>)

In [137]:
# Expect one pooled vector per sentence; the recorded output is (10, 768).
mean_pooled.shape

torch.Size([10, 768])

In [138]:
# Convert the pooled embeddings to a NumPy array for scikit-learn.
# The tensor check keeps this cell re-runnable: after the first run
# `mean_pooled` is already an ndarray, which has no `.detach()`.
# `.cpu()` is a no-op for CPU tensors and is required before `.numpy()` if the
# tensor lives on a GPU.
if torch.is_tensor(mean_pooled):
    mean_pooled = mean_pooled.detach().cpu().numpy()

# Cosine similarity of the first sentence against every sentence, including
# itself, so the first entry of the result is ~1.0.
cosine_similarity(
    [mean_pooled[0]],
    mean_pooled)

array([[1.0000001 , 0.53202623, 0.5009852 , 0.46096292, 0.3418365 ,
        0.40261245, 0.50014305, 0.36782768, 0.5542    , 0.3794273 ]],
      dtype=float32)