Import libraries

In [17]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp311-cp311-win_amd64.whl (8.3 MB)
                                              0.0/8.3 MB ? eta -:--:--
                                              0.1/8.3 MB 3.3 MB/s eta 0:00:03
     -                                        0.4/8.3 MB 4.2 MB/s eta 0:00:02
     --                                       0.6/8.3 MB 3.9 MB/s eta 0:00:02
     ----                                     0.8/8.3 MB 4.4 MB/s eta 0:00:02
     ----                                     1.0/8.3 MB 4.2 MB/s eta 0:00:02
     -----                                    1.2/8.3 MB 4.3 MB/s eta 0:00:02
     ------                                   1.3/8.3 MB 4.0 MB/s eta 0:00:02
     -------                                  1.5/8.3 MB 4.0 MB/s eta 0:00:02
     -------                                  1.6/8.3 MB 3.8 MB/s eta 0:00:02
     --------                                 1.8/8.3 MB 3.8 MB/s eta 0:00:02
     ---------                                2.0/8.3 MB 

In [1]:
import re
import string
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


Create TokenSimilarity class

In [2]:
class TokenSimilarity:
    """Cosine similarity between two texts based on mean-pooled transformer embeddings.

    ``load_pretrained`` must be called before ``predict``: it is the only place
    where ``self.tokenizer`` and ``self.model`` are assigned.
    """

    def load_pretrained(self, from_pretrained:str="indobenchmark/indobert-base-p1"):
        """Load the tokenizer and the TensorFlow model for a checkpoint.

        Args:
            from_pretrained: Checkpoint identifier, passed unchanged to
                ``AutoTokenizer.from_pretrained`` and ``TFAutoModel.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained)
        self.model = TFAutoModel.from_pretrained(from_pretrained)

    def __cleaning(self, text:str):
        """Remove ASCII punctuation and normalise whitespace.

        Args:
            text: Raw input text.

        Returns:
            ``text`` without any character from ``string.punctuation``, with every
            run of whitespace collapsed to one space and no leading/trailing space.
        """
        # clear punctuations
        text = text.translate(str.maketrans('', '', string.punctuation))

        # clear multiple spaces
        # The whitespace class is `\s` (backslash); a `/s+` pattern would look for a
        # literal slash, which has already been stripped above, and never match.
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def __process(self, first_token:str, second_token:str):
        """Embed both texts and mean-pool the token embeddings of each.

        Reads ``self.max_length``, ``self.truncation`` and ``self.padding``, which
        ``predict`` sets before calling this method.

        Args:
            first_token: First (already cleaned) text.
            second_token: Second (already cleaned) text.

        Returns:
            A numpy array with one mean-pooled embedding row per input text
            (row 0 for ``first_token``, row 1 for ``second_token``).
        """
        # Tokenize both texts together as a batch of two.
        inputs = self.tokenizer([first_token, second_token],
                                max_length=self.max_length,
                                truncation=self.truncation,
                                padding=self.padding,
                                return_tensors='tf')

        attention = inputs.attention_mask

        outputs = self.model(**inputs)

        # get the weights from the last layer as embeddings
        embeddings = outputs[0] # when used in older transformers version
        # embeddings = outputs.last_hidden_state # when used in newer one

        # add more dimension then expand tensor
        # to match embeddings shape by duplicating its values by rows
        mask = tf.expand_dims(attention, -1)
        mask = tf.cast(mask, tf.float32)
        mask = tf.broadcast_to(mask, tf.shape(embeddings))

        # Zero out the embeddings at positions the attention mask marks as 0,
        # so they do not contribute to the sum below.
        masked_embeddings = embeddings * mask

        # MEAN POOLING FOR 2ND DIMENSION
        # first, get sums by 2nd dimension
        # second, get counts of 2nd dimension
        # third, calculate the mean, i.e. sums/counts
        summed = tf.reduce_sum(masked_embeddings, axis=1)
        # Clip the counts to a tiny positive minimum so the division never hits zero.
        counts = tf.clip_by_value(tf.reduce_sum(mask, axis=1), clip_value_min=1e-9, clip_value_max=float('inf'))
        mean_pooled = summed/counts

        # return mean pooling as numpy array
        return mean_pooled.numpy()

    def predict(self, first_token:str, second_token:str,
                return_as_embeddings:bool=False, max_length:int=16,
                truncation:bool=True, padding:str="max_length"):
        """Compute the cosine similarity between two texts.

        Both texts are cleaned with the private cleaning step (punctuation removed,
        whitespace normalised) before being embedded.

        Args:
            first_token: First text to compare.
            second_token: Second text to compare.
            return_as_embeddings: When true, return the mean-pooled embeddings of
                both texts instead of their similarity.
            max_length: Forwarded to the tokenizer as ``max_length``.
            truncation: Forwarded to the tokenizer as ``truncation``.
            padding: Forwarded to the tokenizer as ``padding``.

        Returns:
            The array of mean-pooled embeddings (one row per text) when
            ``return_as_embeddings`` is true; otherwise the result of
            ``cosine_similarity`` for the two rows, a 2-D array holding a single value.

        Side effects:
            Stores ``max_length``, ``truncation`` and ``padding`` on the instance.
        """
        self.max_length = max_length
        self.truncation = truncation
        self.padding = padding

        first_token = self.__cleaning(first_token)
        second_token = self.__cleaning(second_token)

        mean_pooled_arr = self.__process(first_token, second_token)
        if return_as_embeddings:
            return mean_pooled_arr

        # calculate similarity
        similarity = cosine_similarity([mean_pooled_arr[0]], [mean_pooled_arr[1]])

        return similarity

Create the model

In [3]:
# Build the similarity helper, then load the 'indobert-large-p2' checkpoint into it.
model = TokenSimilarity()
model.load_pretrained(from_pretrained='indobenchmark/indobert-large-p2')

Some layers from the model checkpoint at indobenchmark/indobert-large-p2 were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-large-p2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [4]:
# Sample texts: two that differ only by letter case, plus a much longer one.
token1, token2, token3 = (
    'pupuk npk',
    'Pupuk npk',
    'PUPUK NPK MUTIARA 16-16-16 ORIGINAL KEMASAN PABRIK 1KG',
)

In [5]:
# Compare token1 against each of the other samples and print each similarity result.
for other_token in (token2, token3):
    print(model.predict(token1, other_token))

[[0.99999976]]
[[0.7087986]]
