## Load Model và thêm những tokens chữ Nôm vào

In [2]:
from transformers import (BertTokenizer, BertForMaskedLM)
# device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
device = 'cpu'

ancient_chinese_tokenizer = BertTokenizer.from_pretrained('Jihuai/bert-ancient-chinese') # fast_tokenizer=False
ancient_chinese_model = BertForMaskedLM.from_pretrained('Jihuai/bert-ancient-chinese')

  from .autonotebook import tqdm as notebook_tqdm
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [119]:
ancient_chinese_model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(38208, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [120]:
ancient_chinese_tokenizer

BertTokenizer(name_or_path='Jihuai/bert-ancient-chinese', vocab_size=38208, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [121]:
# Trước khi thêm tokens mới thì một số ký tự chữ Nôm sẽ ko encode được
test_text = "󱪺𨰈󱮷󰔃󰭋鸦鸨"
""" encoded = ancient_chinese_tokenizer(
    test_text,
    return_tensors="pt",
    add_special_tokens=True
) """
encoding = ancient_chinese_tokenizer.encode(test_text, add_special_tokens=True)  
print(encoding)
decode = ancient_chinese_tokenizer.decode(encoding)
print(decode)

[101, 100, 7887, 7888, 102]
[CLS] [UNK] 鸦 鸨 [SEP]


### Tạo tập vocab_nom bao gồm các chữ trong vocab ban đầu và thêm cả các chữ Nôm 

In [3]:
vocab_nom = []
with open("vocab_Han_Nom.txt","r", encoding="utf-8") as f:
    data = f.read().splitlines()
    for i in data:
        vocab_nom.append(i)

In [4]:
vocab_model = ancient_chinese_tokenizer.get_vocab().keys()
vocab_nom = list(vocab_model)+list(set(vocab_nom) - set(vocab_model))
print(len(vocab_nom))

50424


### Thêm tokens mới vào tokenizer

In [124]:
# Thêm token mới
num_added_tokens = ancient_chinese_tokenizer.add_tokens(vocab_nom)
# In số token đã thêm thành công
print(f"Number of tokens added: {num_added_tokens}")

Number of tokens added: 12212


In [125]:
# Kiểm tra encode và decode
#test_text = "󱪺𨰈󱮷󰔃󰭋鸦鸨"
test_text = "耨 於 廛 伽  𨴦 茹"
#encoded = ancient_chinese_tokenizer.encode(test_text, add_special_tokens=True)
#decoded = ancient_chinese_tokenizer.decode(encoded, add_special_tokens=True)

encoded = ancient_chinese_tokenizer.encode(test_text, add_special_tokens=True)
decoded = ancient_chinese_tokenizer.decode(encoded, add_special_tokens=True)
print("Encoded IDs:", encoded)
print("Decoded Text:", decoded)


Encoded IDs: [101, 5454, 3176, 22735, 850, 29685, 49808, 5765, 102]
Decoded Text: [CLS] 耨 於 廛 伽  𨴦 茹 [SEP]


In [126]:
print(ancient_chinese_tokenizer.convert_tokens_to_ids(["焕", "", "焗", "焘", "焙"]))

[4185, 29685, 4187, 4188, 4189]


In [127]:
ancient_chinese_model.resize_token_embeddings(len(ancient_chinese_tokenizer))

Embedding(50420, 768, padding_idx=0)

In [128]:
ancient_chinese_tokenizer.save_pretrained("NomBertTokenizer")

('NomBertTokenizer\\tokenizer_config.json',
 'NomBertTokenizer\\special_tokens_map.json',
 'NomBertTokenizer\\vocab.txt',
 'NomBertTokenizer\\added_tokens.json')

## Kiểm tra tokenizer mới có thể encode và decode đúng hay không

In [1]:
from transformers import (BertForMaskedLM, BertTokenizer)
# 1. Load the custom tokenizer
tokenizer = BertTokenizer.from_pretrained("NomBertTokenizerv4")

# 2. Load the pre-trained model and resize embeddings
model = BertForMaskedLM.from_pretrained("Jihuai/bert-ancient-chinese")
model.resize_token_embeddings(len(tokenizer))  # Adjust model for the new tokenizer

  from .autonotebook import tqdm as notebook_tqdm
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50420, 768, padding_idx=0)

In [2]:
print(type(tokenizer))

<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>


In [3]:

with open("cleaned_data.txt", "r", encoding="utf-8") as f_combined:
    clean_data = f_combined.read().splitlines()
def preprocess_text(text):
    return text.replace(" ", "").replace("\n", "").replace("\t", "")
for idx,val in enumerate(clean_data[10000:10100]):
    #print("text: ",val)
    try:
        encoded = tokenizer.encode(val)
    except:
        print("Wrong tokenization")
        print(idx, val)
    decoded = tokenizer.decode(encoded, skip_special_tokens=True)
    if preprocess_text(val) != preprocess_text(decoded):
        print("Wrong decoding. text: ", val, "decoded: ", decoded,"index: ", idx)
    #print("decode: ",val)

Wrong tokenization
15 𠇮 猉 肝 𧑂 典 㙴 停 真
Wrong decoding. text:  𠇮 猉 肝 𧑂 典 㙴 停 真 decoded:  昆 侯 㖫 𠻴 𢫕 𨖲 index:  15


In [6]:

# Step 2: Encode the input text using the tokenizer
#test_text = "𠇮 猉 肝 𧑂 典 㙴 停 真"
test_text = "耨 於 廛 伽  覶 茹"
encoding = tokenizer.encode(test_text, add_special_tokens=True)
print(type(tokenizer))

# Print encoded tokens and IDs
print("\nEncoded Tokens:", tokenizer.convert_ids_to_tokens(encoding))
print("Encoded IDs:", encoding)

# Step 3: Decode back to text
decoded_text = tokenizer.decode(encoding, add_special_tokens=True)
print("\nDecoded Text:", decoded_text)

<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>

Encoded Tokens: ['[CLS]', '耨', '於', '廛', '伽', '\ue1d5', '覶', '茹', '[SEP]']
Encoded IDs: [101, 5454, 3176, 22735, 850, 29685, 29680, 5765, 102]

Decoded Text: [CLS] 耨 於 廛 伽  覶 茹 [SEP]
