In [183]:
# Candidate mail-app commands; the first entry is the query that the
# remaining entries are compared against at the end of the notebook.
sentences = [
    "search mail",
    "read mail",
    "send mail",
    "Go back",
    "Search mails by name",
    "starred mails",
    "unread mails",
    "full inbox",
]

In [184]:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint used for both the tokenizer and the encoder, and the fixed
# sequence length every sentence is padded/truncated to.
MODEL_NAME = 'sentence-transformers/bert-base-nli-mean-tokens'
MAX_LENGTH = 128

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Tokenize the whole batch in a single call instead of looping over
# sentences with the legacy `encode_plus` and stacking the rows by hand.
# The result is already a (len(sentences), MAX_LENGTH) tensor per field.
encoded = tokenizer(sentences, max_length=MAX_LENGTH, truncation=True,
                    padding='max_length', return_tensors='pt')

# Keep only the two tensors the rest of the notebook uses, as a plain dict,
# so `model(**tokens)` receives exactly these keyword arguments.
tokens = {'input_ids': encoded['input_ids'],
          'attention_mask': encoded['attention_mask']}

In [185]:
# Sanity check: one row per sentence, one column per (padded) token position.
tokens['input_ids'].size()

torch.Size([8, 128])

In [186]:
# Inference only: run the forward pass without autograd tracking so no
# computation graph is built or kept alive for the downstream tensors.
with torch.no_grad():
    outputs = model(**tokens)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [187]:
# Per-token hidden states from the final encoder layer.
embeddings = outputs['last_hidden_state']
embeddings

tensor([[[-0.3942,  0.3029,  1.4472,  ...,  0.0101,  0.0862,  0.0922],
         [-0.2035,  0.6168,  2.0040,  ..., -0.3405,  0.0207, -0.0964],
         [ 0.0917,  0.4722,  1.3937,  ..., -0.1057, -0.0399, -0.2460],
         ...,
         [ 0.0147,  0.2153,  1.2861,  ..., -0.2732, -0.0982, -0.0100],
         [-0.2540, -0.0648,  1.1406,  ...,  0.0134, -0.0648, -0.2483],
         [-0.1354,  0.0121,  1.1041,  ..., -0.0334, -0.1695, -0.1625]],

        [[-0.2258, -0.0273,  1.1741,  ...,  0.9997,  0.0922, -0.2964],
         [-0.2618,  0.2790,  1.5802,  ...,  0.6187,  0.0968, -0.4657],
         [ 0.0254,  0.0281,  1.3444,  ...,  0.6779, -0.1042, -0.3806],
         ...,
         [ 0.0717, -0.0417,  1.2423,  ...,  0.5344, -0.0085, -0.1196],
         [-0.1413, -0.2517,  0.9939,  ...,  0.7855, -0.0623, -0.3433],
         [-0.0742, -0.0685,  1.0065,  ...,  0.5816, -0.2019, -0.1838]],

        [[-0.0521, -0.0542,  1.1683,  ...,  0.7507, -0.3254, -0.0666],
         [ 0.1002,  0.1243,  1.8430,  ...,  0

In [188]:
# (sentences, token positions, hidden size)
embeddings.size()

torch.Size([8, 128, 768])

In [189]:
# Pull out the padding mask produced by the tokenizer and check its shape.
attention_mask = tokens['attention_mask']
attention_mask.size()

torch.Size([8, 128])

In [190]:
# Add a trailing axis to the mask, stretch it to the embeddings' shape and
# convert to float so it can be multiplied element-wise with the embeddings.
mask = attention_mask[..., None].expand_as(embeddings).float()
mask.size()

torch.Size([8, 128, 768])

In [191]:
# Inspect the expanded mask: ones at the leading token positions, zeros at the trailing padded ones.
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [192]:
# Zero out the hidden states at padded positions.
masked_embeddings = torch.mul(embeddings, mask)
masked_embeddings.size()

torch.Size([8, 128, 768])

In [193]:
# Inspect the masked embeddings: rows at padded positions are now all zeros.
masked_embeddings

tensor([[[-0.3942,  0.3029,  1.4472,  ...,  0.0101,  0.0862,  0.0922],
         [-0.2035,  0.6168,  2.0040,  ..., -0.3405,  0.0207, -0.0964],
         [ 0.0917,  0.4722,  1.3937,  ..., -0.1057, -0.0399, -0.2460],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000, -0.0000],
         [-0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000],
         [-0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000, -0.0000]],

        [[-0.2258, -0.0273,  1.1741,  ...,  0.9997,  0.0922, -0.2964],
         [-0.2618,  0.2790,  1.5802,  ...,  0.6187,  0.0968, -0.4657],
         [ 0.0254,  0.0281,  1.3444,  ...,  0.6779, -0.1042, -0.3806],
         ...,
         [ 0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000],
         [-0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000],
         [-0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000]],

        [[-0.0521, -0.0542,  1.1683,  ...,  0.7507, -0.3254, -0.0666],
         [ 0.1002,  0.1243,  1.8430,  ...,  0

In [194]:
# Collapse the token axis: add up the (masked) token vectors of each sentence.
summed = masked_embeddings.sum(dim=1)
summed.size()

torch.Size([8, 768])

In [195]:
# Sum the mask over the token axis to get the divisor for the mean;
# the lower bound guards the later division against a zero count.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.size()

torch.Size([8, 768])

In [196]:
# Inspect the per-sentence token counts (the same count repeated across the hidden dimension).
summed_mask

tensor([[4., 4., 4.,  ..., 4., 4., 4.],
        [4., 4., 4.,  ..., 4., 4., 4.],
        [4., 4., 4.,  ..., 4., 4., 4.],
        ...,
        [5., 5., 5.,  ..., 5., 5., 5.],
        [7., 7., 7.,  ..., 7., 7., 7.],
        [5., 5., 5.,  ..., 5., 5., 5.]])

In [197]:
# Mean pooling: summed token vectors divided by the token count.
mean_pooled = torch.div(summed, summed_mask)

In [198]:
# Inspect the mean-pooled sentence embeddings (one row per sentence).
mean_pooled

tensor([[-0.2447,  0.5197,  1.7501,  ..., -0.0472, -0.0622, -0.0422],
        [-0.2232,  0.0823,  1.5360,  ...,  0.8506, -0.0146, -0.3242],
        [ 0.0622,  0.0731,  1.6312,  ...,  0.6340, -0.3536, -0.1498],
        ...,
        [ 0.1445, -0.1180,  1.3465,  ..., -0.1240,  0.0778, -0.5535],
        [ 0.0913,  0.4663,  0.9869,  ...,  0.0653, -0.4017, -0.4818],
        [-0.5792, -0.2039,  0.4067,  ...,  0.0853, -0.0118, -0.1104]],
       grad_fn=<DivBackward0>)

In [199]:
from sklearn.metrics.pairwise import cosine_similarity

In [200]:
# Convert to NumPy for scikit-learn. The isinstance guard keeps this cell
# idempotent: on a second run `mean_pooled` is already an ndarray and has no
# `.detach()`. `.cpu()` makes the conversion work regardless of device.
if isinstance(mean_pooled, torch.Tensor):
    mean_pooled = mean_pooled.detach().cpu().numpy()

# Cosine similarity of the first sentence against every other sentence;
# the result has shape (1, number of sentences - 1).
cosine_similarity(
    mean_pooled[:1],
    mean_pooled[1:]
)

array([[0.7906319 , 0.8302841 , 0.4992538 , 0.81634897, 0.61364186,
        0.6586647 , 0.49970666]], dtype=float32)