## Import libraries

In [1]:
import re
import string
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load base model and tokenizer

In [2]:
# One checkpoint identifier feeds both loaders so the tokenizer vocabulary
# and the encoder weights come from the same pretrained model.
model_name = 'cahya/bert-base-indonesian-1.5G'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertModel.from_pretrained(model_name)

Some layers from the model checkpoint at cahya/bert-base-indonesian-1.5G were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at cahya/bert-base-indonesian-1.5G.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


## Tokenizer test

In [3]:
# Two short sample phrases used to smoke-test the tokenizer.
text_test = ['Pupuk NPK', 'Pupuk Nitrogen']

# Tokenize the pair as one batch of TensorFlow tensors, truncating anything
# longer than 128 tokens and padding so both rows have the same length.
text_preprocessed = tokenizer(
    text_test,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors="tf",
)

# Show which tensors the tokenizer produced.
text_preprocessed.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [4]:
# Token ids for the two test phrases, one row per phrase.
text_preprocessed['input_ids']

<tf.Tensor: shape=(2, 5), dtype=int32, numpy=
array([[    3, 11994, 24540,  1028,     1],
       [    3, 11994, 10819,     1,     2]])>

In [5]:
# Token type ids for the same batch (same shape as input_ids).
text_preprocessed['token_type_ids']

<tf.Tensor: shape=(2, 5), dtype=int32, numpy=
array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])>

In [6]:
# Attention mask for the same batch; the single 0 sits at the end of the
# shorter second phrase (presumably the padded position — TODO confirm).
text_preprocessed['attention_mask']

<tf.Tensor: shape=(2, 5), dtype=int32, numpy=
array([[1, 1, 1, 1, 1],
       [1, 1, 1, 1, 0]])>

## Base model test

In [7]:
# Run the tokenized test batch through the base model and list the
# named outputs it returns.
test_results = model(text_preprocessed)
test_results.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [8]:
# Use the pooled output as one embedding vector per test phrase.
test_encoded = test_results['pooler_output']
first_embedding, second_embedding = test_encoded[0], test_encoded[1]

# cosine_similarity expects 2-D inputs, so each vector is wrapped in a list;
# the result is a 1x1 matrix holding the similarity of the two phrases.
cosine_similarity([first_embedding], [second_embedding])

array([[0.96769905]], dtype=float32)

## Create Dataset

In [9]:
# Load the sentence-pair dataset and display its splits and columns.
# NOTE(review): the config name is "en" although the dataset id ends in
# "_id" — confirm this is the intended subset.
dataset = load_dataset("LazarusNLP/stsb_mt_id", name="en")
dataset

Found cached dataset parquet (C:/Users/Teguh/.cache/huggingface/datasets/LazarusNLP___parquet/LazarusNLP--stsb_mt_id-53495c8bc04ac9ed/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 664.08it/s]


DatasetDict({
    validation: Dataset({
        features: ['domain', 'data', 'type', 'score', 'correlation', 'text_1', 'text_2'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['domain', 'data', 'type', 'score', 'correlation', 'text_1', 'text_2'],
        num_rows: 1379
    })
})

In [10]:
# The loaded DatasetDict only has 'validation' and 'test' splits, so the
# 'validation' split is used for training and 'test' for validation.
train_dataset = dataset['validation']
# Plain assignment on purpose: a chained `val_dataset = train_dataset = ...`
# would also rebind train_dataset to the test split, making both names point
# at the same rows.
val_dataset = dataset['test']

In [11]:
# Collect the 'correlation' value of every row, split by split.
train_sim, val_sim = (
    [row['correlation'] for row in split]
    for split in (train_dataset, val_dataset)
)

# Convert each value to float and divide by 5.0
# (presumably the top of the score scale, giving targets in [0, 1] — TODO confirm).
train_norm_sim, val_norm_sim = (
    [float(score) / 5.0 for score in scores]
    for scores in (train_sim, val_sim)
)

## Build the model

In [None]:
# Build a similarity-regression model: pretrained BERT encoder -> dropout ->
# single sigmoid unit.
#
# `model` is rebound to the finished Keras model at the end of this cell, so
# keep a separate handle on the encoder. On a re-run `model` is no longer a
# TFBertModel and the handle saved by the first run is reused, which keeps the
# cell idempotent instead of wrapping the already-built model again.
if isinstance(model, TFBertModel):
    bert_encoder = model

# Bert layers
# The Hugging Face tokenizer is ordinary Python and cannot be applied to a
# symbolic Keras string tensor, so the graph takes already-tokenized int32
# tensors (the three keys the tokenizer returns) with a variable sequence
# length. Tokenize the text with `tokenizer(..., return_tensors="tf")` and pass
# `dict(encoded)` to the model.
preprocessed_text = {
    key: tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name=key)
    for key in ('input_ids', 'token_type_ids', 'attention_mask')
}
outputs = bert_encoder(preprocessed_text)

# Neural network layers
# The encoder's outputs are 'last_hidden_state' and 'pooler_output'; there is
# no 'pooled_output' key.
pooled = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooler_output'])
similarity = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(pooled)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=preprocessed_text, outputs=[similarity])