In [None]:
import tensorflow as tf
from transformers import BertTokenizer
import sys
sys.path.append('/kaggle/input/new')
from bert import minBert
from bert import EncoderLayer, SelfAttention, FeedForward, BaseAttention
from positional_embedding import PositionalEmbedding
sys.path.append('/kaggle/input/contrastive_learning')
from contrastive_learning import QqpTripletSupervisedContrastiveModel as SupervisedContrastiveModel

In [3]:
# WordPiece tokenizer of the uncased BERT-base checkpoint; the same instance is
# passed to prepare_data and find_most_similar further down.
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [None]:
# Restore the saved encoder. The archive refers to the project's custom layer
# classes by name, so each one is handed to Keras through `custom_objects`.
min_bert_layer = tf.keras.models.load_model(
    '/kaggle/input/min_bert_layer.keras',
    custom_objects=dict(
        minBert=minBert,
        EncoderLayer=EncoderLayer,
        SelfAttention=SelfAttention,
        FeedForward=FeedForward,
        BaseAttention=BaseAttention,
        PositionalEmbedding=PositionalEmbedding,
    ),
)

In [37]:
from datasets import load_dataset

# Load the QQP triplets dataset ("embedding-data/QQP_triplets"); prepare_data
# reads each sample's 'query', 'pos' and 'neg' fields.
dataset = load_dataset("embedding-data/QQP_triplets")

In [36]:
def prepare_data(dataset, tokenizer, batch_size=32, max_length=256, num_negatives=30):
    """Convert QQP-triplet samples into token ids for training and evaluation.

    Args:
        dataset: Mapping whose ``'set'`` entry is an iterable of samples. Each
            sample is a dict with ``'query'`` (str), ``'pos'`` (list of str)
            and ``'neg'`` (list of str).
        tokenizer: Callable invoked as ``tokenizer(texts, padding='max_length',
            truncation=True, max_length=max_length, return_tensors="tf")``;
            its result is indexed with ``['input_ids']``.
        batch_size: Unused. Kept only so existing callers keep working.
        max_length: Length every text is padded / truncated to.
        num_negatives: A sample is kept only if its ``'neg'`` list has exactly
            this many entries, so every kept sample has the same number of
            negatives.

    Returns:
        dict with the keys
        ``'anchors'``: token ids of the queries,
        ``'positives'``: token ids of the first positive of each sample,
        ``'negatives'``: list (one entry per sample) of token ids of that
        sample's negatives,
        ``'anchor_sentences'``: the raw query strings,
        ``'sample_sentences'``: per sample, ``[positive] + negatives`` as raw
        strings, so the positive is always at index 0.
    """
    anchors = []
    positives = []
    negatives = []
    sample_list = []

    for sample in dataset['set']:
        # Skip samples that have no positive at all (indexing [0] would fail)
        # or that do not carry the expected number of negatives.
        if not sample['pos'] or len(sample['neg']) != num_negatives:
            continue
        positive = sample['pos'][0]
        anchors.append(sample['query'])
        positives.append(positive)
        negatives.append(sample['neg'])
        # Only the positive that is actually used goes into the candidate list.
        # Additional paraphrases from sample['pos'] would sit at indices >= 1
        # and be scored as misses by an "index 0 is correct" evaluation.
        sample_list.append([positive] + list(sample['neg']))

    print("Tổng số anchors:", len(anchors))
    print("Tổng số positives:", len(positives))
    print("Tổng số negatives:", len(negatives))

    # Check the inputs for None values. `negatives` is a list of lists, so its
    # inner strings have to be inspected individually.
    has_none = any(text is None for text in anchors + positives) or any(
        text is None for group in negatives for text in group
    )
    if has_none:
        print("Cảnh báo: Tìm thấy None trong các dữ liệu đầu vào!")

    # Tokenization
    def tokenize_texts(texts):
        return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors="tf")

    anchor_encodings = tokenize_texts(anchors)
    positive_encodings = tokenize_texts(positives)
    # One tokenizer call per sample keeps each sample's negatives grouped.
    negative_encodings = [tokenize_texts(negative_group) for negative_group in negatives]

    # Collect everything the training and evaluation cells need.
    prepared = {
        'anchors': anchor_encodings['input_ids'],
        'positives': positive_encodings['input_ids'],
        'negatives': [encoding['input_ids'] for encoding in negative_encodings],
        'anchor_sentences': anchors,
        'sample_sentences': sample_list
    }

    return prepared

In [73]:
# Training triplets: rows 16000-20999 of the 'train' split. prepare_data drops
# samples that do not have exactly 30 negatives, so fewer rows come back.
train_dataset = prepare_data(dataset['train'][16000:21000], tokenizer)


Tổng số anchors: 4213
Tổng số positives: 4213
Tổng số negatives: 4213


In [38]:
# Held-out evaluation triplets: rows 5001-5999 of the 'train' split, a range
# that does not overlap the rows used for training.
test_dataset = prepare_data(dataset['train'][5001: 6000], tokenizer)

Tổng số anchors: 839
Tổng số positives: 839
Tổng số negatives: 839


In [45]:
def create_tf_dataset(train_data, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of (anchor, positive, negatives) triplets.

    Args:
        train_data: Mapping with the entries ``'anchors'``, ``'positives'`` and
            ``'negatives'``; they are sliced together along their first axis.
        batch_size: Number of triplets per batch.

    Returns:
        A ``tf.data.Dataset`` yielding batched 3-tuples in the order
        (anchors, positives, negatives).
    """
    triplet_columns = tuple(
        train_data[key] for key in ('anchors', 'positives', 'negatives')
    )
    return tf.data.Dataset.from_tensor_slices(triplet_columns).batch(batch_size)

In [74]:
# Batches of 4 triplets; each triplet carries an anchor, a positive and a group
# of negatives, all padded to prepare_data's max_length.
tf_train_dataset = create_tf_dataset(train_dataset, batch_size=4)

In [11]:
# Wrap the loaded encoder in the triplet contrastive model imported from
# contrastive_learning. No loss is passed to compile(); presumably the model
# computes its contrastive loss internally - confirm in contrastive_learning.
model = SupervisedContrastiveModel(min_bert_layer)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5))


In [12]:
# Enable eager execution of tf.function-wrapped code.
# `tf.config.experimental_run_functions_eagerly` is deprecated; this is the
# supported replacement with the same effect.
tf.config.run_functions_eagerly(True)


In [75]:
# One pass over the batched training triplets.
model.fit(tf_train_dataset, epochs=1)

[1m1054/1054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2004s[0m 2s/step - loss: 3.3661


<keras.src.callbacks.history.History at 0x7b7e493978b0>

In [78]:
# Save the encoder itself (not the contrastive wrapper) to the working
# directory. Presumably its weights were updated by model.fit because `model`
# was built around this same object - confirm in contrastive_learning.
min_bert_layer.save("min_bert_layer_after_training_qqp_triplet_v3.keras")

In [39]:
# Raw text for retrieval evaluation: one query string per sample, plus that
# sample's candidate sentences as assembled by prepare_data.
queries = test_dataset['anchor_sentences']
sample_sentences_list = test_dataset['sample_sentences']


In [41]:
def find_most_similar(query, dataset, model, tokenizer, max_length=256):
    """Return the index of the candidate sentence most similar to ``query``.

    Both the query and the candidates are tokenized, passed through ``model``
    in inference mode, and represented by the model output at sequence
    position 0 (the [CLS] token when a BERT tokenizer adds special tokens).
    Candidates are ranked by cosine similarity to the query.

    Args:
        query: Query sentence.
        dataset: Non-empty list of candidate sentences.
        model: Callable taking ``input_ids`` and ``training=False``; its output
            is indexed as ``[:, 0, :]``, i.e. it must be rank 3.
        tokenizer: Hugging Face style tokenizer returning ``"input_ids"``.
        max_length: Length every text is padded / truncated to. Defaults to
            256, the value that was previously hard-coded.

    Returns:
        Index into ``dataset`` of the best match, as returned by
        ``tf.argmax(...).numpy()``.

    Raises:
        ValueError: If ``dataset`` contains no candidates.
    """
    if len(dataset) == 0:
        raise ValueError("dataset must contain at least one candidate sentence")

    def embed(texts):
        # Identical tokenizer settings for query and candidates.
        encoded = tokenizer(
            texts,
            return_tensors="tf",
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=max_length,
        )
        return model(encoded["input_ids"], training=False)[:, 0, :]

    query_embedding = embed(query)
    dataset_embeddings = embed(dataset)

    # tf.keras.losses.cosine_similarity is a loss: it returns the NEGATED
    # cosine similarity, so the sign is flipped back before taking argmax.
    similarities = tf.keras.losses.cosine_similarity(query_embedding, dataset_embeddings)
    return tf.argmax(-similarities).numpy()


In [76]:
# Top-1 retrieval check: a query counts as correct when the best-ranked
# candidate is the one at index 0 of its candidate list.
correct_count = sum(
    1
    for query_text, candidate_sentences in zip(queries, sample_sentences_list)
    if find_most_similar(query_text, candidate_sentences, min_bert_layer, tokenizer) == 0
)


In [77]:
# Report top-1 accuracy. The value labelled "percentage" is the raw fraction
# correct_count / len(queries), i.e. a number between 0 and 1.
print(f"correct: {correct_count} sentences, percentage: {correct_count / len(queries)}")

correct: 667 sentences, percentage: 0.7949940405244339
