## This example demonstrates merging two trained tokenizers.
### The base tokenizer comes from TinyLlama; it is extended with the vocabulary and merge rules of the tokenizer from RakutenAI-7B, a Japanese LLM developed by Rakuten.

In [10]:
!pip install -q transformers==4.44.0
!pip install -q sentencepiece==0.2.0
!pip install -q datasets==2.20.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
from transformers import AutoTokenizer

# Base LLM: fetch the fast TinyLlama tokenizer and keep a local copy on disk
# so that its tokenizer.json and config files can be read back later.
model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tiny_tok = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model,
    use_fast=True,
)
tiny_tok.save_pretrained(save_directory='./tiny_tokenizer/')
# Confirm that the fast (tokenizers-backed) implementation was loaded.
print(tiny_tok.is_fast)

True


In [12]:
# Japanese LLM: fetch the fast RakutenAI-7B tokenizer and keep a local copy on
# disk so that its tokenizer.json can be read back later.
model = "Rakuten/RakutenAI-7B"
rak_tok = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model,
    use_fast=True,
)
rak_tok.save_pretrained(save_directory='./rak_tokenizer/')
# Confirm that the fast (tokenizers-backed) implementation was loaded.
print(rak_tok.is_fast)

True


In [13]:
import os
import json


# Merge the vocabulary and merge rules of ./rak_tokenizer into those of
# ./tiny_tokenizer and write the result to <output_dir>/tokenizer.json.
output_dir = './merged_tokenizer'
os.makedirs(output_dir, exist_ok=True)  # Make the directory if necessary

tiny_tokenizer_path = os.path.join('./tiny_tokenizer', 'tokenizer.json')
rak_tokenizer_path = os.path.join('./rak_tokenizer', 'tokenizer.json')
merged_tokenizer_path = os.path.join(output_dir, 'tokenizer.json')

# Read each tokenizer.json exactly once, closing the file handles afterwards.
# The encoding is explicit because the vocabularies contain non-ASCII tokens
# and the platform's default encoding is not guaranteed to be UTF-8.
with open(tiny_tokenizer_path, encoding='utf-8') as fp:
    old_vocab_json = json.load(fp)
with open(rak_tokenizer_path, encoding='utf-8') as fp:
    new_model = json.load(fp)["model"]

# Read vocabs
old_vocab = old_vocab_json["model"]["vocab"]
new_vocab = new_model["vocab"]
next_id = max(old_vocab.values()) + 1  # first id above every existing token id
print("old_vocab:", len(old_vocab))
print("new_vocab:", len(new_vocab))
print("next_id:", next_id)

# Merge vocabs: tokens that only exist in the new vocab get fresh ids,
# existing tokens keep the id they already have in the old vocab.
for word in new_vocab:
    if word not in old_vocab:
        old_vocab[word] = next_id
        next_id += 1
print("next_id:", next_id)
print("old_vocab:", len(old_vocab))

# Attach the merged vocab to the tokenizer definition
old_vocab_json["model"]["vocab"] = old_vocab
# The merged size is echoed twice to keep this cell's printed output unchanged.
print(len(old_vocab_json["model"]["vocab"]))
print(len(old_vocab_json["model"]["vocab"]))

# Read merge rules
old_merge = old_vocab_json["model"]["merges"]
new_merge = new_model["merges"]
print("old_merge:", len(old_merge))
print("new_merge:", len(new_merge))

# Add new merge rules, the order of merge rule has to be maintained:
# old rules first, then the unseen new rules in their original order.
old_merge_set = set(old_merge)  # set membership keeps the filter linear
combined_merge = old_merge + [merge_rule for merge_rule in new_merge if merge_rule not in old_merge_set]
print("combined_merge:", len(combined_merge))

# Save merged vocab and merge rules in a single write
old_vocab_json["model"]["merges"] = combined_merge
with open(merged_tokenizer_path, 'w', encoding='utf-8') as fp:
    json.dump(old_vocab_json, fp, ensure_ascii=False, indent=4)

# Read the written file back to confirm what actually landed on disk
print("<<<=====>>>")
with open(merged_tokenizer_path, encoding='utf-8') as fp:
    merged_model = json.load(fp)["model"]
mrg_vocab = merged_model["vocab"]
mrg_merge = merged_model["merges"]
print("mrg_vocab:", len(mrg_vocab))
print("mrg_merge:", len(mrg_merge))

old_vocab: 32000
new_vocab: 48000
next_id: 32000
next_id: 55796
old_vocab: 55796
55796
55796
old_merge: 61249
new_merge: 80520
combined_merge: 97852
<<<=====>>>
mrg_vocab: 55796
mrg_merge: 97852


In [14]:
import shutil

# Copying the special_tokens_map and tokenizer_config to the merged tokenizer is necessary to ensure it can be properly loaded.
# The destination is derived from `output_dir` so the copies always land in the
# same directory that is loaded below, even if `output_dir` is changed.
for config_name in ("special_tokens_map.json", "tokenizer_config.json"):
    shutil.copyfile(
        os.path.join("./tiny_tokenizer", config_name),
        os.path.join(output_dir, config_name),
    )

new_tok = AutoTokenizer.from_pretrained(output_dir, use_fast=True)

In [15]:
# Testing: encode the same Japanese samples with the base, the Japanese and
# the merged tokenizer (in that order) and print the resulting token ids.
text = ["犬", "自然言語処理とは何か"]

for sample in text:
    for tokenizer in (tiny_tok, rak_tok, new_tok):
        token_ids = tokenizer.encode(sample, add_special_tokens=False, return_tensors="pt")
        print(token_ids)
    print("<<<=====>>>")

tensor([[29871,   234,   141,   175]])
tensor([[28705, 35423]])
tensor([[29871, 43223]])
<<<=====>>>
tensor([[29871, 30688, 31516, 31243, 30968,   232,   138,   169, 30687, 30364,
         30449, 31502, 30412]])
tensor([[28705, 34474, 34646, 35981, 32149, 46772]])
tensor([[29871, 42276, 42448, 43779, 39965, 54568]])
<<<=====>>>


### After this step, the model must be modified, because the tokenizer's vocabulary size has changed.
#### Refer to model_modification_new_tokenizer.ipynb for model modification.