In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sys
sys.path.append('/kaggle/input/new')
from positional_embedding import PositionalEmbedding
from bert import minBert
from bert import EncoderLayer, SelfAttention, FeedForward, BaseAttention
import sentencepiece as spm
from transformers import BertTokenizer

In [None]:
mlm_data = pd.read_csv('/kaggle/input/bertmodel/bookcorpus.csv')

In [10]:
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id
unk_token_id = tokenizer.unk_token_id
mask_token_id = tokenizer.mask_token_id
vocab_size = tokenizer.vocab_size

print(f"CLS Token ID: {cls_token_id}")
print(f"SEP Token ID: {sep_token_id}")
print(f"PAD Token ID: {pad_token_id}")
print(f"UNK Token ID: {unk_token_id}")
print(f"MASK Token ID: {mask_token_id}")
print(f"Vocab size: {vocab_size}")



CLS Token ID: 101SEP Token ID: 102PAD Token ID: 0UNK Token ID: 100MASK Token ID: 103Vocab size: 30522

In [11]:
def tokenize_and_mask(texts, 
                      noise=0.15, 
                      tokenizer=tokenizer):
    encoded_texts = tokenizer(texts, return_tensors="tf", add_special_tokens=True, padding=True, truncation=True, max_length=256)['input_ids']
    inp_mask = np.random.rand(*encoded_texts.shape) < noise
    inp_mask[encoded_texts == 101] = False
    inp_mask[encoded_texts == 102] = False
    inp_mask[encoded_texts == 0] = False
    
    labels = -1 * np.ones(encoded_texts.shape, dtype=int)
    # Set labels for masked tokens
    labels[inp_mask] = encoded_texts[inp_mask]

    # Prepare input
    encoded_texts_masked = np.copy(encoded_texts)
    # Set input to [MASK] which is the last token for the 90% of tokens
    # This means leaving 10% unchanged
    inp_mask_2mask = inp_mask & (np.random.rand(*encoded_texts.shape) < 0.90)
    encoded_texts_masked[
        inp_mask_2mask
    ] = tokenizer.mask_token_id  # mask token is the last in the dict

    # Set 10% to a random token
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*encoded_texts.shape) < 1 / 9)
    encoded_texts_masked[inp_mask_2random] = np.random.randint(
        104, 200, inp_mask_2random.sum()
    )
    
    # Prepare sample_weights to pass to .fit() method
    sample_weights = np.ones(labels.shape)
    sample_weights[labels == -1] = 0
    
    y_labels = np.copy(encoded_texts)

    return encoded_texts_masked, y_labels, sample_weights
    

In [12]:
BATCH_SIZE = 32
x_masked_train, y_masked_labels, sample_weights = tokenize_and_mask(mlm_data['text'][0:1500000].tolist())

mlm_ds = tf.data.Dataset.from_tensor_slices(
    (x_masked_train, y_masked_labels)
)
mlm_ds = mlm_ds.shuffle(1000).batch(BATCH_SIZE)
mlm_ds

<_BatchDataset element_spec=(TensorSpec(shape=(None, 164), dtype=tf.int32, name=None), TensorSpec(shape=(None, 164), dtype=tf.int32, name=None))>

In [13]:
min_bert_layer = minBert(
    name = 'minbert',
    num_layers=4,  # Số lượng lớp encoder
    d_model=256,   # Kích thước vector ẩn
    num_heads=8,   # Số lượng head trong multi-head attention
    dff=1024,      # Số lượng neuron trong feed-forward network
    vocab_size=tokenizer.vocab_size,  # Kích thước từ vựng
    dropout_rate=0.1
)

In [14]:
mlm_model = tf.keras.Sequential([
    min_bert_layer,
    tf.keras.layers.Dense(512, activation="relu"),
    tf.keras.layers.Dense(tokenizer.vocab_size, activation="softmax")
])  

In [15]:
loss_fuction = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False,
    ignore_class= -1,
)

mlm_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=loss_fuction,
    metrics=["accuracy"],
)

In [16]:
mlm_model.fit(mlm_ds, epochs=2)

Epoch 1/2


I0000 00:00:1732672986.058951     110 service.cc:145] XLA service 0x7c06d40116f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732672986.059020     110 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
W0000 00:00:1732672987.093812     110 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert

I0000 00:00:1732673005.363010     110 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m46875/46875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7711s[0m 164ms/step - accuracy: 0.9813 - loss: 0.2249
Epoch 2/2
[1m46875/46875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7655s[0m 163ms/step - accuracy: 0.9919 - loss: 0.0510


<keras.src.callbacks.history.History at 0x7c0809957700>

In [21]:
mlm_model.save('mlmmodel.keras')

In [15]:
restored_model = tf.keras.models.load_model(
    'mlmmodel.keras',
    custom_objects={
        "minBert": minBert,
    }
)



In [16]:
mlm_model.summary()

In [17]:
min_bert_layer.summary()

In [18]:
min_bert_layer.save("min_bert_layer.keras")

In [20]:
restored_model = tf.keras.models.load_model(
    'min_bert_layer.keras',
    custom_objects={
        "minBert": minBert,
        "EncoderLayer": EncoderLayer,
        "SelfAttention": SelfAttention,
        "FeedForward": FeedForward,
        "BaseAttention" : BaseAttention,
        "PositionalEmbedding" : PositionalEmbedding
    }
)



In [21]:
restored_model.summary()

In [22]:
restored_model(tf.constant([[1011, 1012]]))

<tf.Tensor: shape=(1, 2, 256), dtype=float32, numpy=
array([[[ 7.69535363e-01, -1.21936035e+00, -5.28453887e-01,
          7.25269496e-01,  7.65354037e-01, -1.09919727e-01,
          6.66685164e-01,  5.57543099e-01, -1.86943337e-01,
         -3.65172237e-01, -1.17172098e+00,  7.52176940e-01,
          5.21580815e-01,  9.68844235e-01,  1.61041594e+00,
          1.11663975e-01, -1.14915299e+00, -1.63497522e-01,
          3.11338216e-01,  7.54375681e-02,  2.31324315e-01,
          9.62701917e-01, -1.12953115e+00,  9.22213316e-01,
          3.10716480e-01,  2.20776582e+00, -1.25946522e-01,
         -1.58513904e+00, -4.29644436e-01, -4.48094279e-01,
          5.56437492e-01, -1.59491450e-01,  2.99311578e-01,
          2.42751494e-01, -2.80126989e-01,  4.73398060e-01,
         -1.82507563e+00, -4.06455755e-01, -3.01502705e-01,
         -7.82541454e-01,  1.00253439e+00, -1.54386535e-01,
          6.34262525e-03,  1.16350245e+00, -8.27867687e-01,
          9.56638992e-01, -1.87372506e-01, -5.1