# Calculating Similarity
When calculating similarity between our transformer embedded vectors, we can use any of the three similarity metrics already covered.

But first, let's create some embeddings.

In [136]:
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import pandas as pd

In [137]:
# Hugging Face Hub identifier of the checkpoint; the tokenizer and the model are both loaded from it
model_name = 'sentence-transformers/stsb-distilbert-base'

In [138]:
# load the tokenizer and the model from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [139]:
# Sentence 0 is the reference that the other sentences are compared against later on;
# sentence 2 restates it using different words.
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "Standing on one's head at job interviews forms a lasting impression.",
    "It took him a month to finish the meal.",
    "He found a leprechaun in his walnut shell."
]

In [140]:
# Tokenize every sentence in one batched call instead of looping with the
# legacy encode_plus API. Each sequence is truncated or padded to exactly
# 128 tokens, so the returned PyTorch tensors already have one row per
# sentence and no manual stacking is needed.
encoded = tokenizer(sentences, max_length=128, truncation=True,
                    padding='max_length', return_tensors='pt')

# show the token ids as a list holding one 1-D tensor per sentence
print(list(encoded['input_ids']))

# keep the two tensors in a plain dict so it can be unpacked with model(**tokens)
tokens = {
    'input_ids': encoded['input_ids'],
    'attention_mask': encoded['attention_mask'],
}

[tensor([  101,  2093,  2086,  2101,  1010,  1996, 13123,  2001,  2145,  2440,
         1997, 15333,  7174,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0]

In [141]:
# confirm the ids form a single 2-D tensor: one row per sentence, one column per token position
tokens['input_ids'].shape, tokens['input_ids']

(torch.Size([6, 128]),
 tensor([[  101,  2093,  2086,  2101,  1010,  1996, 13123,  2001,  2145,  2440,
           1997, 15333,  7174,  1012,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0, 

In [142]:
# Run the forward pass for inference only: inside torch.no_grad() PyTorch does
# not record an autograd graph, so no memory is spent on intermediate results
# for a backward pass that this notebook never performs.
with torch.no_grad():
    outputs = model(**tokens)

outputs.keys(), outputs

(odict_keys(['last_hidden_state']),
 BaseModelOutput(last_hidden_state=tensor([[[ 0.8595, -0.4167,  0.2726,  ...,  0.2641,  1.2247, -0.8213],
          [ 0.7735,  0.1024,  0.6709,  ..., -0.0553,  0.4683, -1.2978],
          [ 1.2928,  0.0555,  0.8122,  ..., -0.6352,  0.6269, -2.0369],
          ...,
          [ 0.5782, -0.0531,  0.1934,  ...,  0.2532,  0.9440, -0.6694],
          [ 0.6663, -0.0837,  0.3519,  ...,  0.3226,  1.0445, -0.7756],
          [ 0.9282, -0.0499,  0.4605,  ...,  0.3597,  0.9780, -0.7612]],
 
         [[-0.1912, -0.0761, -0.2263,  ..., -0.2528, -0.4573,  1.2299],
          [-0.1747, -0.2707,  0.0206,  ...,  0.2107, -0.7694,  0.7053],
          [ 0.0048,  0.2234, -0.0399,  ...,  0.1220, -0.4588,  0.5472],
          ...,
          [-0.5226,  0.2483, -0.0838,  ..., -0.2564, -0.2952,  1.1068],
          [-0.5535,  0.2634, -0.3198,  ..., -0.0839, -0.2407,  1.1977],
          [-0.4390,  0.1172, -0.1790,  ..., -0.0739, -0.1049,  1.1897]],
 
         [[-0.5386,  0.1490,  

The dense vector representations of our text are contained within the `last_hidden_state` tensor of `outputs`, which we access like so:

In [143]:
# last_hidden_state holds one vector per token position:
# (sentence, token position, embedding dimension)
embeddings = outputs.last_hidden_state
embeddings.shape, embeddings

(torch.Size([6, 128, 768]),
 tensor([[[ 0.8595, -0.4167,  0.2726,  ...,  0.2641,  1.2247, -0.8213],
          [ 0.7735,  0.1024,  0.6709,  ..., -0.0553,  0.4683, -1.2978],
          [ 1.2928,  0.0555,  0.8122,  ..., -0.6352,  0.6269, -2.0369],
          ...,
          [ 0.5782, -0.0531,  0.1934,  ...,  0.2532,  0.9440, -0.6694],
          [ 0.6663, -0.0837,  0.3519,  ...,  0.3226,  1.0445, -0.7756],
          [ 0.9282, -0.0499,  0.4605,  ...,  0.3597,  0.9780, -0.7612]],
 
         [[-0.1912, -0.0761, -0.2263,  ..., -0.2528, -0.4573,  1.2299],
          [-0.1747, -0.2707,  0.0206,  ...,  0.2107, -0.7694,  0.7053],
          [ 0.0048,  0.2234, -0.0399,  ...,  0.1220, -0.4588,  0.5472],
          ...,
          [-0.5226,  0.2483, -0.0838,  ..., -0.2564, -0.2952,  1.1068],
          [-0.5535,  0.2634, -0.3198,  ..., -0.0839, -0.2407,  1.1977],
          [-0.4390,  0.1172, -0.1790,  ..., -0.0739, -0.1049,  1.1897]],
 
         [[-0.5386,  0.1490,  0.2582,  ...,  0.7013,  0.3570,  0.2032],


In [144]:
# the attention mask is 1 at real token positions and 0 at padding positions
attention_mask = tokens['attention_mask']
attention_mask.shape, attention_mask

(torch.Size([6, 128]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     

In [145]:
# add a trailing axis to the (sentence, token) mask and broadcast it to the
# shape of `embeddings`, then cast to float so the two can be multiplied
mask = attention_mask[..., None].expand_as(embeddings).float()
mask.shape, mask

(torch.Size([6, 128, 768]),
 tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [0., 0., 0.

In [146]:
# zero out the vectors at padding positions so they cannot contribute to the pooled sum
masked_embeddings = torch.mul(embeddings, mask)
masked_embeddings.shape, masked_embeddings

(torch.Size([6, 128, 768]),
 tensor([[[ 0.8595, -0.4167,  0.2726,  ...,  0.2641,  1.2247, -0.8213],
          [ 0.7735,  0.1024,  0.6709,  ..., -0.0553,  0.4683, -1.2978],
          [ 1.2928,  0.0555,  0.8122,  ..., -0.6352,  0.6269, -2.0369],
          ...,
          [ 0.0000, -0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
          [ 0.0000, -0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
          [ 0.0000, -0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000]],
 
         [[-0.1912, -0.0761, -0.2263,  ..., -0.2528, -0.4573,  1.2299],
          [-0.1747, -0.2707,  0.0206,  ...,  0.2107, -0.7694,  0.7053],
          [ 0.0048,  0.2234, -0.0399,  ...,  0.1220, -0.4588,  0.5472],
          ...,
          [-0.0000,  0.0000, -0.0000,  ..., -0.0000, -0.0000,  0.0000],
          [-0.0000,  0.0000, -0.0000,  ..., -0.0000, -0.0000,  0.0000],
          [-0.0000,  0.0000, -0.0000,  ..., -0.0000, -0.0000,  0.0000]],
 
         [[-0.5386,  0.1490,  0.2582,  ...,  0.7013,  0.3570,  0.2032],


In [147]:
# add up the token vectors of each sentence along the token axis (dim 1)
summed = masked_embeddings.sum(dim=1)
summed.shape, summed

(torch.Size([6, 768]),
 tensor([[ 14.3088,  -2.9345,   8.4755,  ...,   1.2702,  13.5866, -17.3509],
         [ -1.9532,   3.5471,  -1.8108,  ...,   3.3763,  -8.7374,  21.2288],
         [ -3.7284,   5.2960,  10.8504,  ...,  12.8177,   1.2172,  -2.1437],
         [  4.2427,   0.9002,   3.6707,  ...,  -7.6842,   4.1800,   1.5905],
         [  1.5297,   4.1886,   4.5376,  ...,  -5.8318,   6.1333,   1.5439],
         [ -2.2378,  -5.5957, -10.2656,  ...,   0.9247,  10.5801,   0.1131]],
        grad_fn=<SumBackward1>))

In [148]:
# count the real tokens of each sentence (the count repeats across the embedding
# dimension); the clamp keeps the divisor strictly positive
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.shape, summed_mask

(torch.Size([6, 768]),
 tensor([[15., 15., 15.,  ..., 15., 15., 15.],
         [22., 22., 22.,  ..., 22., 22., 22.],
         [15., 15., 15.,  ..., 15., 15., 15.],
         [16., 16., 16.,  ..., 16., 16., 16.],
         [12., 12., 12.,  ..., 12., 12., 12.],
         [14., 14., 14.,  ..., 14., 14., 14.]]))

In [149]:
# mean pooling: summed token vectors divided by the number of real tokens
mean_pooled = torch.div(summed, summed_mask)

In [150]:
# one pooled embedding vector per sentence
mean_pooled

tensor([[ 0.9539, -0.1956,  0.5650,  ...,  0.0847,  0.9058, -1.1567],
        [-0.0888,  0.1612, -0.0823,  ...,  0.1535, -0.3972,  0.9649],
        [-0.2486,  0.3531,  0.7234,  ...,  0.8545,  0.0811, -0.1429],
        [ 0.2652,  0.0563,  0.2294,  ..., -0.4803,  0.2613,  0.0994],
        [ 0.1275,  0.3491,  0.3781,  ..., -0.4860,  0.5111,  0.1287],
        [-0.1598, -0.3997, -0.7333,  ...,  0.0661,  0.7557,  0.0081]],
       grad_fn=<DivBackward0>)

Let's calculate cosine similarity for sentence 0:

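Ekspresi mean_pooled.detach().numpy() berfungsi untuk mengubah tensor mean_pooled (hasil mean pooling yang sudah dihitung pada sel-sel sebelumnya) menjadi array numpy setelah tensor tersebut dilepas (detach) dari graf komputasi. Ekspresi ini tidak menghitung rata-rata lagi.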

Penjelasan lebih detailnya:

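- mean_pooled: variabel yang berisi tensor berukuran (6, 768), yaitu satu vektor per kalimat yang diperoleh dengan merata-ratakan embedding token yang bukan padding (mean pooling).
- .detach(): metode yang mengembalikan tensor dengan data yang sama tetapi terlepas dari graf komputasi, sehingga tidak lagi ikut dalam perhitungan gradien (backward).
- .numpy(): metode untuk mengubah tensor menjadi array numpy; pada tensor yang masih terhubung ke graf komputasi (memiliki grad_fn), .numpy() akan gagal, sehingga .detach() harus dipanggil lebih dulu.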

Dengan demikian, mean_pooled.detach().numpy() akan menghasilkan array numpy dengan nilai dan ukuran yang sama seperti tensor mean_pooled, yaitu satu vektor embedding untuk setiap kalimat.

Menghapus koneksi tensor dari graf komputasi mengacu pada operasi pemutusan hubungan antara tensor dan graf komputasi yang digunakan untuk menghitung gradien dalam proses backpropagation pada jaringan saraf. Ketika kita memutuskan hubungan tersebut, tensor hasilnya tidak lagi menyimpan jejak operasi yang membentuknya, sehingga gradien tidak akan mengalir melewatinya. Hal ini dapat membantu mengurangi penggunaan memori karena graf komputasi tidak perlu dipertahankan untuk tensor tersebut. Selain itu, tensor yang sudah dilepas dapat diproses secara independen di luar autograd, misalnya diubah menjadi array numpy seperti pada sel berikut.

In [151]:
# Convert the pooled embeddings to a NumPy array for scikit-learn.
# torch.as_tensor passes a tensor through unchanged and wraps an existing
# array, so re-running this cell after `mean_pooled` has already become an
# ndarray no longer fails on the missing .detach() method.
mean_pooled = torch.as_tensor(mean_pooled).detach().numpy()

# compare sentence 0 (kept 2-D by slicing) against every other sentence;
# the result has one row and one column per remaining sentence
cosine = cosine_similarity(mean_pooled[:1], mean_pooled[1:])

cosine

array([[ 0.04432429,  0.29033935, -0.00305667,  0.02244765,  0.2989486 ]],
      dtype=float32)

cosine_similarity adalah sebuah fungsi yang digunakan untuk menghitung cosine similarity antara dua set vektor. Cosine similarity adalah sebuah metrik yang digunakan untuk mengukur seberapa mirip dua set vektor dalam ruang dimensi yang sama. Semakin dekat nilai cosine similarity antara dua vektor ke 1, semakin mirip kedua vektor tersebut. Fungsi cosine_similarity dapat digunakan untuk mengukur kesamaan antara dua set dokumen atau teks, sehingga berguna dalam proses information retrieval atau natural language processing.

In [172]:
# pair each candidate sentence with its similarity to sentence 0;
# `cosine` has a single row, so cosine[0] (equivalently next(iter(cosine)))
# is the flat array of scores
data = dict(
    Sentence=sentences[1:],
    Similarity=cosine[0],
)

create_table = pd.DataFrame(data)

In [173]:
# display the sentence / similarity table
create_table

Unnamed: 0,Sentence,Similarity
0,The fish dreamed of escaping the fishbowl and ...,0.044324
1,The person box was packed with jelly many doze...,0.290339
2,Standing on one's head at job interviews forms...,-0.003057
3,It took him a month to finish the meal.,0.022448
4,He found a leprechaun in his walnut shell.,0.298949


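So, as intended, the sentence at index 2 of `sentences` (row 1 of the table) - which contains the same meaning as our first sentence, without using the same words - scores far higher (0.290) than the unrelated sentences at indices 1, 3 and 4 (at most 0.044). Note that in this run it is not the single highest score: "He found a leprechaun in his walnut shell." is marginally ahead at 0.299. For reference, our first sentence is: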

"Three years later, the coffin was still full of Jello."