In [None]:
# Dependencies of this notebook (uncomment and run once in a fresh environment):
# !pip install -q Korpora
# !pip install transformers
# !pip install python-mecab-kor

In [22]:
from tqdm import tqdm
from Korpora import Korpora
from mecab import MeCab

from transformers import PreTrainedTokenizerFast, AutoTokenizer

from custom_tokenizer import SentencePieceCustomTokenizer

In [1]:
# Download the 'kcbert' corpus with Korpora, using the current directory as the
# root. Per the logged output this fetches three archive parts and unpacks them;
# later cells read the result from kcbert/20190101_20200611_v2.txt.
Korpora.fetch('kcbert', root_dir='.')

[kcbert] download kcbert-train.tar.gzaa: 100%|██████████| 2.10G/2.10G [01:46<00:00, 19.6MB/s]   
[kcbert] download kcbert-train.tar.gzab: 100%|██████████| 2.10G/2.10G [01:00<00:00, 34.9MB/s]   
[kcbert] download kcbert-train.tar.gzac: 671MB [00:20, 32.6MB/s]                              


Unzip tar. It needs a few minutes ... ./._20190101_20200611_v2.txt
./20190101_20200611_v2.txt
done


In [2]:
# Peek at the first 10 raw lines of the downloaded corpus to sanity-check its
# format before processing the whole file.
# The encoding is pinned to UTF-8: without it open() falls back to the platform
# locale, which can fail or garble the Korean text on non-UTF-8 systems.
with open('kcbert/20190101_20200611_v2.txt', 'r', encoding='utf-8') as f:
    # readline() returns '' at EOF, so a shorter file still yields 10 entries.
    check_data = [f.readline() for _ in range(10)]

In [3]:
# Build the working corpus file `dataset.txt`.
# With DEMO_MODE on, only the first CORPUS_SIZE lines of the raw corpus are
# kept; otherwise the raw corpus file itself becomes dataset.txt.
DEMO_MODE = True 
if DEMO_MODE:
    # Number of leading corpus lines to keep (`#@param` is a Colab form annotation).
    CORPUS_SIZE = 5000000 #@param {type:"number"}
    # IPython substitutes $CORPUS_SIZE with the Python variable before running the shell command.
    !(head -n $CORPUS_SIZE kcbert/20190101_20200611_v2.txt) > dataset.txt  
else:
    # NOTE(review): `mv` removes the file from kcbert/, so cells that read
    # kcbert/20190101_20200611_v2.txt cannot be re-run after this branch executes.
    !mv kcbert/20190101_20200611_v2.txt dataset.txt

In [4]:
# Confirm that dataset.txt was created and show its size on disk.
!ls -lh dataset.txt

-rw-r--r-- 1 root root 693M Oct  7 07:01 dataset.txt


In [5]:
# Split dataset.txt into fixed-size shards under ./shards.
# `-p` keeps this cell re-runnable: a plain `mkdir` fails once ./shards exists.
!mkdir -p ./shards
# 256,000 lines per shard, with 4-digit numeric suffixes (shard_0000, shard_0001, ...).
!split -a 4 -l 256000 -d dataset.txt ./shards/shard_
# List the generated shards.
!ls ./shards/

shard_0000  shard_0004	shard_0008  shard_0012	shard_0016
shard_0001  shard_0005	shard_0009  shard_0013	shard_0017
shard_0002  shard_0006	shard_0010  shard_0014	shard_0018
shard_0003  shard_0007	shard_0011  shard_0015	shard_0019


In [12]:
# Load the whole working corpus into memory, one list entry per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default, which would break on the Korean text on non-UTF-8 systems.
with open('dataset.txt', "r", encoding="utf-8") as f:
    data = f.read()

# strip() first so the trailing newline does not produce an empty last entry.
origin_texts = data.strip().split('\n')

In [13]:
# Number of corpus lines loaded; with DEMO_MODE on this should equal CORPUS_SIZE.
len(origin_texts)

5000000

In [20]:
# Pre-segment every corpus line with MeCab and write the space-joined result to
# new_dataset.txt, which is the file the tokenizer is trained on.
mecab = MeCab()

# Write as UTF-8 explicitly so the output encoding does not depend on the
# platform locale (the default when `encoding` is omitted).
with open("new_dataset.txt", "w", encoding="utf-8") as f:
    for text in tqdm(origin_texts):
        # mecab.morphs() yields the pieces of the line; join them with single spaces.
        morphemes = mecab.morphs(text)
        f.write(' '.join(morphemes) + '\n')

100%|██████████| 5000000/5000000 [23:03<00:00, 3614.37it/s]


In [23]:
# Special tokens; a token's position in this list is used as its id below.
special_tokens = ["[pad]", "[bos]", "[eos]", "[unk]"]

# Key every token by its name without the surrounding brackets
# ("[pad]" -> "pad") and record its id and literal token string.
special_token_dict = {
    token[1:-1]: {"id": token_id, "token": token}
    for token_id, token in enumerate(special_tokens)
}

special_token_dict

{'pad': {'id': 0, 'token': '[pad]'},
 'bos': {'id': 1, 'token': '[bos]'},
 'eos': {'id': 2, 'token': '[eos]'},
 'unk': {'id': 3, 'token': '[unk]'}}

In [24]:
# Post-processing templates that wrap encoded sequences in [bos] / [eos].
_bos = special_token_dict['bos']
_eos = special_token_dict['eos']

TemplateProcessing_dict = {
    # Single sequence: [bos] A [eos]
    "single": f"{_bos['token']} $A {_eos['token']}",
    # Sequence pair: [bos] A [eos] B [eos]; the ":1" suffix is part of the template string for the second segment.
    "pair": f"{_bos['token']} $A {_eos['token']} $B:1 {_eos['token']}:1",
    # (token, id) pairs for the special tokens used in the templates above.
    "special_tokens": [
        (_bos['token'], _bos['id']),
        (_eos['token'], _eos['id']),
    ],
}

In [25]:
# Model-type names considered for the first constructor argument below:
# 'BPE' 
# 'Unigram' 
# 'WordLevel'
# 'WordPiece'


# Normalizer names handed to SentencePieceCustomTokenizer, a project-local class.
# Presumably each key selects a normalizer step — verify in custom_tokenizer.
norm_keys = ['Nmt', 'NFKC', 'Replace', 'Lowercase']

# Build the (untrained) tokenizer from the model type, normalizers, special
# tokens and post-processing templates defined above.
# NOTE(review): the variable is called `sp_uni_tokenizer` although the model
# type passed here is 'WordPiece'; later cells reference this name.
sp_uni_tokenizer = SentencePieceCustomTokenizer('WordPiece', 
                                                norm_keys, 
                                                special_token_dict,
                                                TemplateProcessing_dict,
                                                add_prefix_space=True)

In [26]:
%%time

# Train the tokenizer on the morpheme-segmented corpus written by the MeCab
# cell. `%%time` reports how long the training took.
file = "new_dataset.txt"

sp_uni_tokenizer.train(
    files=[file],
    vocab_size=30000,   # the vocabulary size can be specified here
    show_progress=True
)




CPU times: user 11min 51s, sys: 5.81 s, total: 11min 56s
Wall time: 3min 5s


In [31]:
# Wrap the trained tokenizer in a transformers PreTrainedTokenizerFast so it can
# be configured, saved and reloaded through the transformers API in later cells.
tokenizer = PreTrainedTokenizerFast(tokenizer_object=sp_uni_tokenizer)

In [45]:
# Register the special tokens on the transformers wrapper so they are written
# to the saved tokenizer's special-token configuration.
tokenizer.add_special_tokens({'pad_token': special_token_dict['pad']['token']})
# [bos] doubles as the CLS token and [eos] as the SEP token.
tokenizer.bos_token_id = special_token_dict['bos']['id']
tokenizer.cls_token_id = special_token_dict['bos']['id']
tokenizer.eos_token_id = special_token_dict['eos']['id']
tokenizer.sep_token_id = special_token_dict['eos']['id']
# [unk] is declared in special_token_dict alongside the others but was never
# registered here, which left the wrapper without an unk_token.
tokenizer.unk_token_id = special_token_dict['unk']['id']

In [46]:
# Directory the tokenizer files are written to; the returned tuple lists the
# files that were saved.
# NOTE(review): the directory has the same name as the imported
# `custom_tokenizer` module — confirm the two do not clash on disk.
tokenizer_path = 'custom_tokenizer'

tokenizer.save_pretrained(tokenizer_path)

('custom_tokenizer/tokenizer_config.json',
 'custom_tokenizer/special_tokens_map.json',
 'custom_tokenizer/tokenizer.json')

In [47]:
# Reload the tokenizer from the saved directory, rebinding `tokenizer`, so the
# checks below exercise the files on disk rather than the in-memory object.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [48]:
# Sample sentences used to eyeball the reloaded tokenizer's output.
test_texts = [
    '3시 30분이 걸렸다',
    '3분이서 가세요',
    '오늘의 순서는 누구인가요?',
    '앤드류가필드가 말했다.',
    '3시 30분',
]

# For each sample print the token ids, the decoded round trip and the token
# strings, followed by a separator line.
for sample in test_texts:
    token_ids = tokenizer.encode(sample)
    print(token_ids)
    print(tokenizer.decode(token_ids))
    print(tokenizer.tokenize(sample))
    print('--------------------')

[1, 142, 21, 5991, 142, 21, 18, 5306, 9467, 21598, 9497, 2]
[bos] 3시 30분##이 걸렸##다[eos]
['▁', '3', '시', '▁', '3', '0', '분', '##이', '▁걸렸', '##다']
--------------------
[1, 142, 21, 5306, 9467, 9660, 18842, 19600, 2]
[bos] 3분##이##서 가##세요[eos]
['▁', '3', '분', '##이', '##서', '▁가', '##세요']
--------------------
[1, 19687, 9572, 24285, 9508, 19379, 9495, 9525, 9494, 31, 2]
[bos] 오늘##의 순서##는 누구##인##가##요?[eos]
['▁오늘', '##의', '▁순서', '##는', '▁누구', '##인', '##가', '##요', '?']
--------------------
[1, 26236, 9515, 10015, 9525, 9899, 9515, 9525, 18882, 10612, 9497, 16, 2]
[bos] 앤##드##류##가##필##드##가 말##했##다.[eos]
['▁앤', '##드', '##류', '##가', '##필', '##드', '##가', '▁말', '##했', '##다', '.']
--------------------
[1, 142, 21, 5991, 142, 21, 18, 5306, 2]
[bos] 3시 30분[eos]
['▁', '3', '시', '▁', '3', '0', '분']
--------------------


In [49]:
# Sanity check: the reloaded pad id should match special_token_dict['pad']['id'].
tokenizer.pad_token_id

0

In [50]:
# Sanity check: BOS and CLS were both set from the [bos] id, so they should be equal.
tokenizer.bos_token_id, tokenizer.cls_token_id

(1, 1)

In [51]:
# Sanity check: EOS and SEP were both set from the [eos] id, so they should be equal.
tokenizer.eos_token_id, tokenizer.sep_token_id

(2, 2)

In [None]:
# Other tokenizer implementations, for reference:
# https://github.com/huggingface/tokenizers/tree/5f6e9784526a4cd5e4f6dcdcc045cdceba5463e1/bindings/python/py_src/tokenizers/implementations