In [121]:
from transformers import BertTokenizer, BertForMaskedLM
from transformers import AdamW
import torch
from tqdm.auto import tqdm
from transformers import DataCollatorForLanguageModeling
import matplotlib.pyplot as plt
%matplotlib inline
import math

In [92]:
sikubert_model_path = "SIKU-BERT/sikubert"

tokenizer = BertTokenizer.from_pretrained(sikubert_model_path)
model = BertForMaskedLM.from_pretrained(sikubert_model_path)

Some weights of the model checkpoint at SIKU-BERT/sikubert were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [93]:
nom_vocab_file = 'vocab_Han_Nom.txt'
with open(nom_vocab_file, 'r', encoding='utf-8') as f:
    nom_vocab = [line.strip() for line in f.readlines()]

In [94]:
num_added_tokens = tokenizer.add_tokens(nom_vocab)
model.resize_token_embeddings(len(tokenizer))

Embedding(46119, 768, padding_idx=0)

In [134]:
sample_sentences = [
    "𠄎𠂤𡿨𡯨",
    "民浪屡奴群低",
    "戈䀡󰘚倍踈兮㐌仃"
]

for sentence in sample_sentences:
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f"Câu: {sentence}")
    print(f"Tokens: {tokens}")
    print(f"Token IDs: {token_ids}")
    print("-" * 50)

Câu: 𠄎𠂤𡿨𡯨
Tokens: ['[UNK]', '[UNK]', '[UNK]', '[UNK]']
Token IDs: [100, 100, 100, 100]
--------------------------------------------------
Câu: 民浪屡奴群低
Tokens: ['民', '浪', '屡', '奴', '群', '低']
Token IDs: [3696, 3857, 2249, 1958, 5408, 856]
--------------------------------------------------
Câu: 戈䀡󰘚倍踈兮㐌仃
Tokens: ['戈', '䀡', '\U000f061a', '倍', '踈', '兮', '㐌', '仃']
Token IDs: [2762, 30277, 43993, 945, 24274, 1064, 29811, 786]
--------------------------------------------------


In [96]:
# tokenizer.save_pretrained(save_model_path)
# model.save_pretrained(save_model_path)

In [97]:
test_data = 'test_Dataset.txt'

with open(test_data, 'r', encoding='utf-8') as f:
    _data = [line.strip() for line in f.readlines()]
    print(_data)

['畧畑䀡傳西銘', '𡄎唭𠄩𡦂人情喓𠻗', '埃匕𠳺匕麻𦖑', '𡨺噒役畧苓𠽮身𡢐', '𤳆辰忠孝\U0002b735頭', '𡛔辰節行\U00031eb5句捞𨉓', '固𠊛扵郡東城', '修仁積\U00031429\U000318ad生昆䝨', '達𠸜\U00031eb5陸雲仙', '歲𣃣𠄩糁芸專斈行', '蹺柴𤍇史𥸷經', '𣎃㝵包𬋩功程劳刀', '文它起鳳滕雲', '武添叁畧六韜埃皮', '𠫾䀡\U00032247会科詩', '雲仙𠓨謝尊師𦋦𧗱', '閉𥹰\U0003223e圣預掑', '㐌鮮氣象吏吹精神', '𫢩它及会風雲', '𠊛匕埃拱立身貝尼', '志𭸓𢏑鴈边𩄲', '名碎㐌𤎜㗂柴屯賖', '\U0002b735𤳆𥪝𡎝𠊛些', '畧卢報補𡢐\U00031eb5顕荣', '尊師欺意論盘', '𡄎𥪝𢼂係科塲群賖', '𢵯𢈱\U000f0778敢呐𠚢', '吹柴傷伵㤕車𥪝𢚸', '𡢐油訴浽濁冲', '沛朱𠬠法底防𫉚身', '𣈙昆𠖈准風\U000315a6', '柴朱𠀧道符神\U000f0761蹺', '\U000f0778埋麻及𠯿𠨪', '𠖈滝供𠊡𨖲𰎉拱安', '尊師𠓨准後堂', '雲仙魚謹𢚸強生𪟽', '\U000f0778咍𨉓嗼役之', '\U000f0cb9師𠊛𠰺科期群賖', '𠬠\U00031eb5貝\U00031e7b𭛣茄', '𠄩\U00031eb5\U00031429泊𠀧\U00031eb5才踈', '闭𥹰\U000f0cba飭詩書', '会尼\U000f0778及群除会芇', '盛衰\U000f0778別\U0002b735牢', '之朋哙吏理芇朱明', '𠓨䜹朱訢事情', '丕𡢐𠦳淡登程買安', '尊師𡓮唉咀嘆', '𥄭𠚢畧按𧡊払阻\U0003137a', '哙浪萬里程途', '昆渚拮挭群\U0003137a𭛣之', '咍\U00031eb5昆唉胡𪟽', '柴盘𠬠𭛣科期群賖', '云仙𦖑呐連䜹', '小生庄別𬋟湄𠯿芇', '椿萱歲鶴㐌高', '吀柴排訢音毛朱詳', '尊師𦖑呐添傷', '𢩮𢬣\U00031702準前堂䀡𦝄', '因机謀事𠴍浪', '𭛣𠊛\U000f0778恪\U000f061a𦝄𨕭𡗶', '雖浪㵢泣尼匕', '欺𤎎欺訢欺潙欺苔', '𡢐昆拱𤎜𭛣𠁀', '呂\U00031eb5柴沛𣴓唎爫之', '𡢐昆𠄩𡦂科期', '斗星㐌𤎜紫

In [98]:
inputs = tokenizer(
    _data,
    return_tensors='pt',
    truncation=True,
    padding=True,
    max_length=216,
)

In [99]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [100]:
labels = inputs['input_ids'].clone()
labels[~masked_tensor] = -100  # Các vị trí không bị mask sẽ không được tính loss
inputs['labels'] = labels

In [101]:
inputs

{'input_ids': tensor([[  101, 21829,  4520,  ...,     0,     0,     0],
        [  101, 36720, 27247,  ...,   102,     0,     0],
        [  101,  1812,  1264,  ...,     0,     0,     0],
        ...,
        [  101,  2618, 25222,  ...,   102,     0,     0],
        [  101,  2016, 43561,  ...,     0,     0,     0],
        [  101,  6310, 31832,  ...,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 0, 0]]), 'labels': tensor([[-100, -100, 4520,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -1

In [102]:
random_tensor = torch.rand(inputs['input_ids'].shape)

In [103]:
inputs['input_ids'].shape, random_tensor.shape

(torch.Size([134, 12]), torch.Size([134, 12]))

In [104]:
random_tensor

tensor([[7.3841e-01, 2.5501e-01, 5.5220e-01,  ..., 1.7502e-01, 5.8643e-01,
         9.1790e-01],
        [9.2138e-01, 8.1164e-04, 6.1720e-01,  ..., 4.6088e-01, 4.3235e-01,
         9.4866e-01],
        [7.2383e-01, 8.7527e-02, 9.5818e-01,  ..., 6.8351e-01, 4.4784e-01,
         1.0775e-01],
        ...,
        [2.9965e-02, 9.7720e-01, 8.9833e-01,  ..., 6.6679e-01, 1.8124e-01,
         2.0136e-01],
        [1.4638e-01, 1.1553e-01, 1.5428e-01,  ..., 3.8863e-01, 3.9502e-01,
         8.8746e-01],
        [9.6979e-01, 7.9201e-01, 3.7361e-01,  ..., 1.4298e-02, 8.1376e-01,
         2.3274e-01]])

In [105]:
masked_tensor = (random_tensor < 0.1) * (inputs['input_ids'] != 101) * (inputs['input_ids'] != 102) * (inputs['input_ids'] != 0)

In [106]:
masked_tensor

tensor([[False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [107]:
nonzero_indexes = []
for i in range(len(masked_tensor)):
    nonzero_indexes.append(torch.flatten(masked_tensor[i].nonzero()).tolist())

In [108]:
nonzero_indexes

[[],
 [1],
 [1],
 [2],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [2, 6],
 [],
 [],
 [3],
 [1, 4],
 [2],
 [],
 [],
 [2],
 [3, 5],
 [1],
 [],
 [8],
 [],
 [],
 [3, 4],
 [2, 5],
 [],
 [8],
 [],
 [6],
 [],
 [1, 2],
 [2],
 [],
 [5],
 [],
 [],
 [1, 3],
 [],
 [2],
 [4],
 [],
 [1],
 [],
 [2],
 [],
 [6],
 [],
 [],
 [7],
 [1, 2],
 [],
 [],
 [1, 3],
 [4, 5],
 [],
 [6],
 [],
 [],
 [],
 [1, 3, 4],
 [1],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [8],
 [],
 [4, 5, 7, 8],
 [2],
 [],
 [1],
 [1],
 [4],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [6],
 [1],
 [3],
 [],
 [],
 [1],
 [5, 6],
 [1, 7],
 [],
 [5],
 [2],
 [1, 7],
 [4],
 [8],
 [],
 [],
 [],
 [5],
 [2],
 [3],
 [],
 [],
 [],
 [4],
 [],
 [1, 8],
 [5],
 [2, 4],
 [4, 6],
 [],
 [],
 [8],
 [],
 [],
 [3],
 [],
 [],
 [5],
 [2],
 [3, 4],
 [],
 [7],
 [],
 [3, 4]]

In [109]:
for i in range(len(inputs['input_ids'])):
    inputs['input_ids'][i, nonzero_indexes[i]] = 103

In [110]:
inputs['input_ids']

tensor([[  101, 21829,  4520,  ...,     0,     0,     0],
        [  101,   103, 27247,  ...,   102,     0,     0],
        [  101,   103,  1264,  ...,     0,     0,     0],
        ...,
        [  101,  2618, 25222,  ...,   102,     0,     0],
        [  101,  2016, 43561,  ...,     0,     0,     0],
        [  101,  6310, 31832,  ...,   102,     0,     0]])

In [111]:
class MaskFillingDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.encodings['labels'][idx]
        }

In [112]:
dataset = MaskFillingDataset(inputs)

In [113]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [114]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(46119, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

# Setup Model Parameter

In [130]:
epochs = 4
optimizer = AdamW(model.parameters(), lr=1e-6)

In [131]:
train_losses = []
train_perplexities = []

In [132]:
for epoch in range(epochs):
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
    epoch_loss = 0
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        epoch_loss += loss.item()
        train_losses.append(loss.item())

        # Tính Perplexity
        perplexity = math.exp(loss.item())
        train_perplexities.append(perplexity)

        loop.set_postfix(loss=loss.item(), perplexity=perplexity)

    avg_epoch_loss = epoch_loss / len(dataloader)
    avg_epoch_ppl = math.exp(avg_epoch_loss)
    print(f"Epoch {epoch+1} Average Loss: {avg_epoch_loss:.4f} | Perplexity: {avg_epoch_ppl:.4f}")

Epoch 1/4:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 1 Average Loss: 9.5705 | Perplexity: 14336.2941


Epoch 2/4:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 2 Average Loss: 9.2213 | Perplexity: 10110.6465


Epoch 3/4:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 3 Average Loss: nan | Perplexity: nan


Epoch 4/4:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 4 Average Loss: 9.4095 | Perplexity: 12204.2437


In [116]:
plt.figure(figsize=(12, 6))

# Vẽ Loss
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Training Loss', color='blue')
plt.xlabel('Bước Huấn Luyện')
plt.ylabel('Loss')
plt.title('Biểu Đồ Loss Trong Quá Trình Huấn Luyện')
plt.legend()

# Vẽ Perplexity
plt.subplot(1, 2, 2)
plt.plot(train_perplexities, label='Training Perplexity', color='red')
plt.xlabel('Bước Huấn Luyện')
plt.ylabel('Perplexity')
plt.title('Biểu Đồ Perplexity Trong Quá Trình Huấn Luyện')
plt.legend()

plt.tight_layout()
plt.show()