# Building Dense Vectors Using Transformers
We will be using the sentence-transformers/stsb-distilbert-base model to build our dense vectors.

Reference notebook on similarity metrics: https://github.com/jamescalam/transformers/blob/main/course/similarity/02_similarity_metrics.ipynb

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
# Hugging Face Hub identifier of the checkpoint used to build the dense vectors.
model_name = 'sentence-transformers/stsb-distilbert-base'

In [3]:
# Load the tokenizer and the model weights that belong to `model_name`.
# The recorded output shows the files being downloaded on first use.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/489 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

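Perbedaan antara metode `encode` dan `encode_plus`: `encode` hanya mengembalikan daftar ID token (`input_ids`), sedangkan `encode_plus` mengembalikan dictionary yang berisi `input_ids` beserta `attention_mask`, sehingga hasilnya bisa langsung diberikan ke model. Keduanya dapat menerima parameter seperti max_length, truncation, dan padding. Secara umum, encoding adalah proses mengubah data atau informasi (di sini teks) menjadi bentuk numerik yang dapat diproses oleh komputer atau model.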

In [16]:
# Example sentence that will be turned into a dense vector.
text = "hello world what a time to be alive!"

# Tokenize into PyTorch tensors, truncating/padding to exactly 128 token ids
# so the encoding always has a fixed length.
tokens = tokenizer.encode_plus(
    text,
    max_length=128,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

In [25]:
# Display the encoding; the recorded output is a dict holding 'input_ids'
# and 'attention_mask' tensors.
tokens

{'input_ids': tensor([[ 101, 7592, 2088, 2054, 1037, 2051, 2000, 2022, 4142,  999,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [18]:
# Run the encoding through the model.
# NOTE(review): the recorded output carries a grad_fn, so autograd is tracking
# this forward pass; consider torch.no_grad() if only inference is needed.
outputs = model(**tokens) # ** unpacks the dict so each key is passed as a keyword argument
outputs

BaseModelOutput(last_hidden_state=tensor([[[-0.9489,  0.6905, -0.2188,  ...,  0.0161,  0.5874, -0.1449],
         [-0.6643,  1.1984, -0.1346,  ...,  0.4839,  0.6338, -0.5003],
         [-0.3289,  0.6412,  0.2473,  ..., -0.0965,  0.4298,  0.0515],
         ...,
         [-0.7853,  0.8094, -0.2639,  ...,  0.2177,  0.3335,  0.1107],
         [-0.7528,  0.6285, -0.0088,  ...,  0.1024,  0.4585,  0.1720],
         [-1.0754,  0.4878, -0.3458,  ...,  0.2764,  0.5604,  0.1236]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

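outputs.last_hidden_state adalah tensor yang berisi representasi (hidden state) dari lapisan terakhir model untuk setiap token pada setiap sequence dalam batch, dengan bentuk (jumlah sequence, jumlah token, dimensi hidden). Representasi ini dapat digunakan sebagai input untuk tugas downstream seperti klasifikasi teks atau NER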

In [19]:
# Keep the model's last hidden state as the embeddings to be pooled
# (one 768-dimensional vector per token position, per the recorded shape).
embeddings = outputs.last_hidden_state
embeddings

tensor([[[-0.9489,  0.6905, -0.2188,  ...,  0.0161,  0.5874, -0.1449],
         [-0.6643,  1.1984, -0.1346,  ...,  0.4839,  0.6338, -0.5003],
         [-0.3289,  0.6412,  0.2473,  ..., -0.0965,  0.4298,  0.0515],
         ...,
         [-0.7853,  0.8094, -0.2639,  ...,  0.2177,  0.3335,  0.1107],
         [-0.7528,  0.6285, -0.0088,  ...,  0.1024,  0.4585,  0.1720],
         [-1.0754,  0.4878, -0.3458,  ...,  0.2764,  0.5604,  0.1236]]],
       grad_fn=<NativeLayerNormBackward0>)

In [20]:
# Inspect the embeddings' shape; recorded output: torch.Size([1, 128, 768]).
embeddings.shape

torch.Size([1, 128, 768])

In [21]:
# Pull the attention mask out of the encoding. In the recorded output the
# first 11 positions are 1 and the remaining (padded) positions are 0.
attention_mask = tokens['attention_mask']
attention_mask.shape, attention_mask

(torch.Size([1, 128]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]]))

In [22]:
# Add a trailing axis to the mask: [1, 128] -> [1, 128, 1] (per recorded output).
attention_mask.unsqueeze(-1).shape, attention_mask.unsqueeze(-1)

(torch.Size([1, 128, 1]),
 tensor([[[1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],

In [23]:
# Expand the trailing axis to the embeddings' shape:
# [1, 128, 1] -> [1, 128, 768] (per recorded output).
attention_mask.unsqueeze(-1).expand(embeddings.shape).shape, attention_mask.unsqueeze(-1).expand(embeddings.shape)

(torch.Size([1, 128, 768]),
 tensor([[[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]]]))

In [26]:
# Broadcast the attention mask across the hidden dimension and cast it to
# float so it can be multiplied element-wise with the embeddings.
mask = attention_mask.unsqueeze(-1).expand_as(embeddings).float()

Masked embedding di sini adalah representasi vektor dari token-token dalam teks (bentuk numerik dari teks) yang sudah dikalikan dengan attention mask: vektor untuk token asli tetap, sedangkan vektor pada posisi yang harus disembunyikan (token padding, mask bernilai 0) dijadikan nol sehingga tidak ikut memengaruhi hasil mean pooling.

In [46]:
# Element-wise multiply by the mask; in the recorded output the vectors at
# padded positions become zeros.
masked_embeddings = embeddings * mask
masked_embeddings.shape, masked_embeddings

(torch.Size([1, 128, 768]),
 tensor([[[-0.9489,  0.6905, -0.2188,  ...,  0.0161,  0.5874, -0.1449],
          [-0.6643,  1.1984, -0.1346,  ...,  0.4839,  0.6338, -0.5003],
          [-0.3289,  0.6412,  0.2473,  ..., -0.0965,  0.4298,  0.0515],
          ...,
          [-0.0000,  0.0000, -0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [-0.0000,  0.0000, -0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [-0.0000,  0.0000, -0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
        grad_fn=<MulBackward0>))

Fungsi dari program torch.sum adalah untuk menjumlahkan nilai-nilai pada tensor masked_embeddings pada dimensi ke-1.

In [28]:
# Collapse the token axis (dim 1) by summing the masked token vectors,
# leaving one 768-dimensional vector for the sentence.
summed = torch.sum(masked_embeddings, dim=1)
summed.shape, summed

(torch.Size([1, 768]),
 tensor([[-4.2333e+00,  8.5918e+00, -1.9492e+00, -1.5538e+01, -2.5694e+00,
           9.9980e+00, -8.6229e-01,  6.6381e+00,  7.4674e-01, -4.3826e+00,
           4.3145e-01, -5.1452e+00, -7.9033e+00, -1.3049e+00, -7.8313e-01,
           7.2620e-02, -1.5502e+00,  3.4398e+00, -7.2131e+00, -1.8117e+00,
          -1.1028e+00, -4.2192e+00,  6.7406e-01, -8.1200e+00, -6.5910e+00,
           3.1741e+00,  9.5010e+00,  6.4226e+00, -3.8564e-01,  4.7517e+00,
          -5.5122e+00, -4.7848e+00,  2.5848e+00, -4.0840e+00, -1.1049e+01,
           1.1000e+01, -2.3100e+01, -3.5476e+00, -1.7694e+00, -8.1072e+00,
           6.0421e+00, -1.3273e+00,  7.9968e+00, -5.5361e-01, -1.9299e+01,
           8.8631e+00, -5.5608e+00, -5.1889e+00, -1.8400e+00,  6.5700e+00,
           6.1823e+00, -3.9714e+00, -1.8072e+00, -6.0736e+00, -5.5459e+00,
           6.1806e+00, -1.2557e+01,  1.1848e+01,  6.1258e+00, -7.7695e-01,
          -2.9625e+00, -7.5796e+00,  1.9902e+00,  3.4150e+00,  4.2940e-01,
  

Fungsi dari `torch.clamp` di bawah adalah untuk membatasi nilai tensor mask.sum(1) pada rentang nilai minimum 1e-9 dan nilai maksimum yang tidak terbatas, sehingga pembagian pada langkah mean pooling tidak pernah membagi dengan nol.

1e-9 adalah notasi ilmiah yang merepresentasikan bilangan pecahan 0.000000001 atau 1 dibagi dengan 1 miliar.

In [30]:
# Count the unmasked tokens along the token axis, with a floor of 1e-9 so the
# later division can never divide by zero.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.shape

torch.Size([1, 768])

In [31]:
# Display the token counts; the recorded output is 11.0 in every position,
# matching the 11 positions where the attention mask is 1.
summed_mask

tensor([[11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11.

In [42]:
# Mean pooling: divide the summed token vectors element-wise by the token
# counts (both operands were recorded with shape [1, 768]).
mean_pooled = summed / summed_mask

And that is how we calculate our dense similarity vector.

In [43]:
# Display the mean-pooled sentence vector.
mean_pooled

tensor([[-3.8485e-01,  7.8107e-01, -1.7720e-01, -1.4125e+00, -2.3358e-01,
          9.0891e-01, -7.8390e-02,  6.0347e-01,  6.7885e-02, -3.9841e-01,
          3.9223e-02, -4.6774e-01, -7.1848e-01, -1.1863e-01, -7.1193e-02,
          6.6018e-03, -1.4093e-01,  3.1271e-01, -6.5574e-01, -1.6470e-01,
         -1.0026e-01, -3.8357e-01,  6.1278e-02, -7.3818e-01, -5.9918e-01,
          2.8855e-01,  8.6372e-01,  5.8388e-01, -3.5058e-02,  4.3197e-01,
         -5.0111e-01, -4.3498e-01,  2.3498e-01, -3.7127e-01, -1.0044e+00,
          1.0000e+00, -2.1000e+00, -3.2251e-01, -1.6085e-01, -7.3701e-01,
          5.4928e-01, -1.2066e-01,  7.2698e-01, -5.0328e-02, -1.7545e+00,
          8.0573e-01, -5.0553e-01, -4.7172e-01, -1.6727e-01,  5.9727e-01,
          5.6203e-01, -3.6104e-01, -1.6429e-01, -5.5215e-01, -5.0418e-01,
          5.6187e-01, -1.1415e+00,  1.0771e+00,  5.5689e-01, -7.0632e-02,
         -2.6932e-01, -6.8905e-01,  1.8093e-01,  3.1045e-01,  3.9036e-02,
          3.1064e-01, -4.4495e-01, -4.