## Import libraries

In [27]:
import re
import string
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset

In [28]:
max_length = 128
batch_size = 32
epochs = 5
base_model_name = "cahya/bert-base-indonesian-522M"
dataset_name = "LazarusNLP/stsb_mt_id"

## Load base model and tokenizer

In [29]:
tokenizer = BertTokenizer.from_pretrained(base_model_name)
model = TFBertModel.from_pretrained(base_model_name)

Some layers from the model checkpoint at cahya/bert-base-indonesian-522M were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at cahya/bert-base-indonesian-522M.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


## Tokenizer test

In [30]:
text_test = ['Pupuk NPK','Pupuk Nitrogen']
text_preprocessed = tokenizer(text_test, max_length=max_length, padding='max_length', truncation=True, return_tensors="tf")
text_preprocessed.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [31]:
text_preprocessed['input_ids']

<tf.Tensor: shape=(2, 128), dtype=int32, numpy=
array([[    3, 11994, 24540,  1028,     1,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2, 

In [32]:
text_preprocessed['token_type_ids']

<tf.Tensor: shape=(2, 128), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

In [33]:
text_preprocessed['attention_mask']

<tf.Tensor: shape=(2, 128), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

## Base model test

In [34]:
test_results = model(text_preprocessed)
test_results.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [35]:
test_encoded = test_results['pooler_output']
cosine_similarity([test_encoded[0]], [test_encoded[1]])

array([[0.95307875]], dtype=float32)

## Create Dataset

In [36]:
dataset = load_dataset(dataset_name, name="en")
dataset

Found cached dataset parquet (C:/Users/Teguh/.cache/huggingface/datasets/LazarusNLP___parquet/LazarusNLP--stsb_mt_id-53495c8bc04ac9ed/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 608.18it/s]


DatasetDict({
    validation: Dataset({
        features: ['domain', 'data', 'type', 'score', 'correlation', 'text_1', 'text_2'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['domain', 'data', 'type', 'score', 'correlation', 'text_1', 'text_2'],
        num_rows: 1379
    })
})

In [44]:
class STSBertModel(tf.keras.Model):

    def __init__(self):
        super(STSBertModel, self).__init__()

        self.bert_model = model
        self.pooling_layer = tf.keras.layers.GlobalAveragePooling1D()

    def call(self, input_data, training=False):
        input_ids = input_data['input_ids']
        attention_mask = input_data['attention_mask']

        outputs = self.bert_model(input_ids, attention_mask=attention_mask, training=training)
        pooled_output = self.pooling_layer(outputs.last_hidden_state)

        return {'sentence_embedding': pooled_output}

In [45]:
class DataSequence(tf.data.Dataset):

    def __init__(self, dataset):

        similarity = [i['similarity_score'] for i in dataset]
        self.label = [i/5.0 for i in similarity]
        self.sentence_1 = [i['sentence1'] for i in dataset]
        self.sentence_2 = [i['sentence2'] for i in dataset]
        self.text_cat = [[str(x), str(y)] for x, y in zip(self.sentence_1, self.sentence_2)]

    def __len__(self):

        return len(self.text_cat)

    def get_batch_labels(self, idx):

        return tf.constant(self.label[idx])

    def get_batch_texts(self, idx):

        inputs = tokenizer(self.text_cat[idx], padding='max_length', max_length=128, truncation=True, return_tensors="tf")
        return inputs

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

def collate_fn(texts):

    num_texts = len(texts[0]['input_ids'])
    features = list()
    for i in range(num_texts):
        features.append({'input_ids': texts[0]['input_ids'][i], 'attention_mask': texts[0]['attention_mask'][i]})

    return features

In [46]:
class CosineSimilarityLoss(tf.keras.losses.Loss):

    def __init__(self, loss_fct=tf.keras.losses.MeanSquaredError(), cos_score_transformation=tf.identity):
        super(CosineSimilarityLoss, self).__init__()
        self.loss_fct = loss_fct
        self.cos_score_transformation = cos_score_transformation
        self.cos = tf.keras.losses.CosineSimilarity(axis=1)

    def call(self, input, label):
        embedding_1 = tf.stack([inp[0] for inp in input])
        embedding_2 = tf.stack([inp[1] for inp in input])

        output = self.cos_score_transformation(self.cos(embedding_1, embedding_2))

        return self.loss_fct(output, tf.squeeze(label))

In [47]:
def model_train(dataset, epochs, learning_rate, bs):

    model = STSBertModel()

    criterion = CosineSimilarityLoss()
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    train_sequence = DataSequence(dataset)
    train_dataset = tf.data.Dataset.from_generator(train_sequence.__getitem__, output_signature=(tf.TensorSpec(shape=(None, 128), dtype=tf.int32), tf.TensorSpec(shape=(), dtype=tf.float32)))
    train_dataset = train_dataset.batch(bs).map(collate_fn).prefetch(tf.data.AUTOTUNE)

    best_acc = 0.0
    best_loss = 1000

    for epoch in range(epochs):
        total_acc_train = 0
        total_loss_train = 0.0

        for train_data, train_label in tqdm(train_dataset):
            with tf.GradientTape() as tape:
                output = model(train_data, training=True)['sentence_embedding']
                loss = criterion(output, train_label)

            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            total_loss_train += loss.numpy()

        print(f'Epoch {epoch + 1} | Loss: {total_loss_train / len(dataset): .3f}')
        model.trainable = False

    return model

EPOCHS = 8
LEARNING_RATE = 1e-6
BATCH_SIZE = 8

# Train the model
trained_model = model_train(dataset, EPOCHS, LEARNING_RATE, BATCH_SIZE)

TypeError: Can't instantiate abstract class DataSequence with abstract methods _inputs, element_spec