## This example demonstrates training a new monolingual tokenizer for a new target language.

In [33]:
!pip install -q transformers==4.44.0
!pip install -q sentencepiece
!pip install -q datasets==2.20.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [34]:
from transformers import AutoTokenizer

# Fetch the fast tokenizer of the base LLM and keep a copy of it on disk
model = "TinyLlama/TinyLlama_v1.1"
old_tokenizer = AutoTokenizer.from_pretrained(
    model,
    use_fast=True,
)
old_tokenizer.save_pretrained(save_directory="./old_tokenizer/")

('./old_tokenizer/tokenizer_config.json',
 './old_tokenizer/special_tokens_map.json',
 './old_tokenizer/tokenizer.model',
 './old_tokenizer/added_tokens.json',
 './old_tokenizer/tokenizer.json')

In [35]:
from datasets import load_dataset

# Download the Wikipedia dump for the target language (Japanese here)
dataset = load_dataset(
    path="wikimedia/wikipedia",
    name="20231101.ja",
)

In [36]:
# Display the loaded dataset: its splits, column names and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 1389467
    })
})

In [37]:
from random import randint

# Use one third of the articles: that is plenty of text for training a tokenizer.
# shuffle() followed by select() draws rows WITHOUT replacement, so no article is
# repeated in the sample, and the fixed seed makes the subset reproducible after
# a kernel restart.
SAMPLE_SEED = 42
num_samples = dataset["train"].num_rows // 3
train_dataset = dataset["train"].shuffle(seed=SAMPLE_SEED).select(range(num_samples))

In [38]:
# Display the sampled subset to check its size
train_dataset

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 463155
})

In [39]:
# Only the "text" column is read when building the training corpus,
# so every other column is dropped.
non_text_columns = [name for name in train_dataset.column_names if name != "text"]
train_dataset = train_dataset.remove_columns(non_text_columns)

In [40]:
# Display the dataset again to confirm which columns are left
train_dataset

Dataset({
    features: ['text'],
    num_rows: 463155
})

In [41]:
# Batched data loader for tokenizer training.
# `steps` is the default number of rows handed over per batch.
steps = 1000

def get_training_corpus(train_dataset, batch_size=None):
    """Yield the "text" entries of ``train_dataset`` in consecutive batches.

    Args:
        train_dataset: Any object that supports ``len()`` and slice indexing,
            where a slice returns a mapping with a "text" entry.
        batch_size: Number of rows per batch. When omitted, the module-level
            ``steps`` value is used (looked up when iteration starts).

    Yields:
        ``train_dataset[start:start + batch_size]["text"]`` for each
        consecutive slice; the last batch may be shorter.

    Raises:
        ValueError: If the batch size is not positive. Because this is a
            generator, the error surfaces on the first iteration rather than
            at call time.
    """
    if batch_size is None:
        batch_size = steps
    if batch_size <= 0:
        # A negative step would silently yield nothing; fail loudly instead.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start_idx in range(0, len(train_dataset), batch_size):
        yield train_dataset[start_idx : start_idx + batch_size]["text"]
        
# get_training_corpus is a generator function, so this iterator can be consumed
# only once; re-run this line to get a fresh one before iterating over it again.
training_corpus = get_training_corpus(train_dataset)

In [42]:
%%time

# Train a new tokenizer on the sampled corpus, starting from the base LLM
# tokenizer's configuration.
# vocab_size is a hyperparameter: the requested size of the new vocabulary.
# NOTE(review): the recorded output of the test cell shows token ids 2482 and
# 9341, both larger than 1000. Either that output comes from a run with a
# different vocab_size, or the trained vocabulary ended up larger than requested
# (presumably because the corpus contains more distinct characters than 1000) --
# confirm, and consider a larger vocab_size.
new_tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=1000,
    show_progress=True,
)




CPU times: user 22min 26s, sys: 18.6 s, total: 22min 44s
Wall time: 4min 4s


In [14]:
# Persist the newly trained target-language tokenizer to disk
new_tokenizer.save_pretrained(save_directory="./new_monolingual_tokenizer/")

('./new_monolingual_tokenizer/tokenizer_config.json',
 './new_monolingual_tokenizer/special_tokens_map.json',
 './new_monolingual_tokenizer/tokenizer.model',
 './new_monolingual_tokenizer/added_tokens.json',
 './new_monolingual_tokenizer/tokenizer.json')

In [15]:
# Quick check: encode the same text with the new and with the base tokenizer

text = "犬"
# text = "自然言語処理とは何か"

# Same encoding options for both tokenizers so the results are comparable
encode_options = {"add_special_tokens": False, "return_tensors": "pt"}

print(new_tokenizer.encode(text, **encode_options))
print("==========")
print(old_tokenizer.encode(text, **encode_options))

tensor([[2482, 9341]])
tensor([[29871,   234,   141,   175]])


### After this step, the new monolingual tokenizer should be merged with the base tokenizer, followed by the necessary model modifications.
#### Refer to new_monolingual_tokenizer.ipynb for merging the tokenizers.
#### Refer to model_modification_new_tokenizer.ipynb for the model modifications.