In [2]:
import tensorflow as tf
from transformers import BertTokenizer
from datasets import load_dataset
import sys
sys.path.append('/kaggle/input/bertmodel/new')
from bert import minBert
from bert import EncoderLayer, SelfAttention, FeedForward, BaseAttention
from positional_embedding import PositionalEmbedding

In [3]:
# Load the tokenizer published as "google-bert/bert-base-uncased". It is passed
# to the evaluation helpers and supplies `vocab_size` for the minBert constructor below.
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
from scipy.stats import spearmanr

# Encode sentences
def encode_sentences(model, tokenizer, sentences):
    """Turn sentences into unit-length embedding vectors.

    Args:
        model: Callable applied to the batch of token ids; its output is
            indexed as ``[:, 0, :]``.
        tokenizer: Callable returning a mapping that contains ``"input_ids"``.
        sentences: Text input forwarded unchanged to ``tokenizer``.

    Returns:
        The slice of the model output at sequence position 0, L2-normalized
        along axis 1.
    """
    encoded = tokenizer(
        sentences,
        return_tensors="tf",
        padding=True,
        truncation=True,
        max_length=256,
    )
    # NOTE(review): only the token ids are forwarded; whether the model masks
    # padded positions on its own is not visible from here - confirm.
    token_ids = encoded["input_ids"]
    hidden_states = model(token_ids)
    # Keep only sequence position 0 (used here as the CLS embedding).
    first_token_vectors = hidden_states[:, 0, :]
    return tf.nn.l2_normalize(first_token_vectors, axis=1)

# Evaluate similarity
def cosine_similarity(emb1, emb2):
    """Return the row-wise dot product of two embedding batches.

    The result is a true cosine similarity only when the rows of both inputs
    are already unit-length; no normalization is performed here.

    Args:
        emb1: First batch of embeddings.
        emb2: Second batch of embeddings, multiplied element-wise with ``emb1``.

    Returns:
        The element-wise products summed over axis 1.
    """
    elementwise_products = emb1 * emb2
    return tf.reduce_sum(elementwise_products, axis=1)

# Spearman's rank correlation
def evaluate_spearman(predicted_scores, true_scores):
    """Compute Spearman's rank correlation between two score sequences.

    Args:
        predicted_scores: Scores produced by the model.
        true_scores: Reference scores, aligned with ``predicted_scores``.

    Returns:
        The correlation coefficient reported by ``scipy.stats.spearmanr``
        (the p-value is discarded).
    """
    rho, _p_value = spearmanr(predicted_scores, true_scores)
    return rho


In [5]:
def batch_data(data, batch_size):
    """Lazily yield consecutive slices of ``data``.

    Args:
        data: Any sliceable object that supports ``len()``.
        batch_size: Step between slice starts and maximum slice length.

    Yields:
        ``data[start:start + batch_size]`` for each start offset; the final
        slice may be shorter than ``batch_size``.
    """
    start_offsets = range(0, len(data), batch_size)
    yield from (data[start:start + batch_size] for start in start_offsets)


In [6]:
import numpy as np

def encode_sentences_with_batching(model, tokenizer, sentences, batch_size=32):
    """Embed ``sentences`` batch by batch and return one stacked array.

    Args:
        model: Forwarded to ``encode_sentences``.
        tokenizer: Forwarded to ``encode_sentences``.
        sentences: Sliceable collection of sentences, split by ``batch_data``.
        batch_size: Number of sentences encoded per call (default 32).

    Returns:
        The per-batch embeddings concatenated along axis 0 with
        ``np.concatenate``.
    """
    # Encode each batch independently.
    per_batch_embeddings = [
        encode_sentences(model, tokenizer, chunk)
        for chunk in batch_data(sentences, batch_size)
    ]
    # Join all batch embeddings into a single NumPy array.
    return np.concatenate(per_batch_embeddings, axis=0)


In [7]:
def calculate_spearman_correlation(benchmark_dataset_name, model, tokenizer, sentences):
    """Score a model on one benchmark and report Spearman's correlation.

    Both sentence lists are embedded with ``encode_sentences_with_batching``,
    compared pairwise with ``cosine_similarity``, and the resulting scores are
    rank-correlated against the reference scores.

    Args:
        benchmark_dataset_name: Label used in the printed report line.
        model: Model forwarded to the encoding helper.
        tokenizer: Tokenizer forwarded to the encoding helper.
        sentences: Indexable with three entries: ``sentences[0]`` and
            ``sentences[1]`` are the paired sentence collections and
            ``sentences[2]`` holds the reference scores.

    Returns:
        The Spearman correlation that is also printed. Earlier versions only
        printed it and returned ``None``; returning it lets callers collect
        and compare results programmatically.
    """
    embedding_s1 = encode_sentences_with_batching(model, tokenizer, sentences[0])
    embedding_s2 = encode_sentences_with_batching(model, tokenizer, sentences[1])
    true_scores = sentences[2]
    predicted_scores = cosine_similarity(embedding_s1, embedding_s2)

    # cosine_similarity returns a TensorFlow tensor; hand SciPy a NumPy array.
    spearman_corr = evaluate_spearman(predicted_scores.numpy(), true_scores)
    print(f"Spearman's correlation of {benchmark_dataset_name}: {spearman_corr:.4f}")
    return spearman_corr

In [8]:
def load_benchmark_dataset(dataset_name):
    """Load a dataset and pull the evaluation columns from its 'test' split.

    Args:
        dataset_name: Identifier passed to ``load_dataset``.

    Returns:
        A three-item list ``[sentence1 column, sentence2 column, score column]``
        taken from the 'test' split.
    """
    # The entire 'test' split is used; nothing is subsampled here.
    test_split = load_dataset(dataset_name)['test']

    reference_scores = test_split['score']
    first_sentences = test_split['sentence1']
    second_sentences = test_split['sentence2']

    return [first_sentences, second_sentences, reference_scores]


def load_all_benchmark_dataset():
    """Load every benchmark used in this notebook.

    Returns:
        A list of dicts, one per benchmark and in load order, each with the
        keys ``'data'`` (the result of ``load_benchmark_dataset``) and
        ``'name'`` (a short label for reporting).
    """
    # (identifier passed to load_benchmark_dataset, label used in reports)
    benchmarks = (
        ('mteb/stsbenchmark-sts', 'stsbenchmark_sts'),
        ('mteb/sickr-sts', 'sickr_sts'),
        ('mteb/sts12-sts', 'sts12_sts'),
        ('mteb/sts14-sts', 'sts14_sts'),
    )
    return [
        {'data': load_benchmark_dataset(identifier), 'name': label}
        for identifier, label in benchmarks
    ]

In [9]:
# Instantiate minBert directly from hyperparameters.
# NOTE(review): min_bert_layer_init is not referenced again in the visible
# cells; the evaluation below runs on a checkpoint loaded from disk.
min_bert_layer_init = minBert(
    name = 'minbert',
    num_layers=4,  # Number of encoder layers
    d_model=256,   # Hidden vector size
    num_heads=8,   # Number of heads in multi-head attention
    dff=1024,      # Number of neurons in the feed-forward network
    vocab_size=tokenizer.vocab_size,  # Vocabulary size
    dropout_rate=0.1
)




In [None]:
def _load_min_bert_checkpoint(path):
    """Load a saved Keras model from ``path`` with the custom minBert classes registered."""
    return tf.keras.models.load_model(
        path,
        # A fresh mapping is built for every load, as in the original cells.
        custom_objects={
            "minBert": minBert,
            "EncoderLayer": EncoderLayer,
            "SelfAttention": SelfAttention,
            "FeedForward": FeedForward,
            "BaseAttention" : BaseAttention,
            "PositionalEmbedding" : PositionalEmbedding
        }
    )


# One checkpoint per saved file; the stage each represents is inferred from
# the file names only.
min_bert_layer_base = _load_min_bert_checkpoint(
    '/kaggle/input/bertmodel/model_weight/min_bert_layer.keras'
)

min_bert_layer_after_unsupervised_training = _load_min_bert_checkpoint(
    '/kaggle/input/bertmodel/model_weight/min_bert_layer_after_unsupervised_training.keras'
)

min_bert_layer_after_supervised_sli_training = _load_min_bert_checkpoint(
    '/kaggle/input/bertmodel/model_weight/min_bert_layer_after_nli.keras'
)

In [None]:

# Restore the checkpoint whose file name marks it as the QQP-triplet run (v3).
# The custom classes are registered so Keras can rebuild the saved layers.
min_bert_layer_after_supervised_qqp_triplet_training = tf.keras.models.load_model(
    '/kaggle/input/bertmodel/model_weight/min_bert_layer_after_training_qqp_triplet_v3.keras',
    custom_objects=dict(
        minBert=minBert,
        EncoderLayer=EncoderLayer,
        SelfAttention=SelfAttention,
        FeedForward=FeedForward,
        BaseAttention=BaseAttention,
        PositionalEmbedding=PositionalEmbedding,
    ),
)

In [None]:
# Load all benchmarks once; `dataset` is a list of {'data': ..., 'name': ...} dicts.
dataset = load_all_benchmark_dataset()

In [17]:
# Report Spearman's correlation of the QQP-triplet checkpoint on every loaded benchmark.
for data in dataset:
    calculate_spearman_correlation(data['name'], min_bert_layer_after_supervised_qqp_triplet_training, tokenizer, data['data'])

Spearman's correlation of stsbenchmark_sts: 0.5647
Spearman's correlation of sickr_sts: 0.5281
Spearman's correlation of sts12_sts: 0.4706
Spearman's correlation of sts14_sts: 0.4840
