# Tokenization

In [None]:
# Virtual environment setup: the commands below are meant to be run in a shell
# (not in this notebook) to activate the conda environment and start Jupyter.
'''
conda activate py310
jupyter notebook
'''

## Training a Tokenizer
This section assumes that the sequence–temperature input data are already available as `.txt` files in `../data/temperature_mask/`.

In [2]:
# Base name of the output folder; the tokenizer files are saved under ../<output_name>.
output_name = "KantaiBERT"

In [3]:
%%time
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

# Collect the training corpus: every .txt file in the masked-temperature data folder.
# Globbing from the folder itself keeps ".." out of the glob pattern, and sorting
# makes the file order deterministic (glob order is otherwise filesystem-dependent).
data_dir = Path("../data/temperature_mask")
paths = sorted(str(x) for x in data_dir.glob("*.txt"))
print('text files in current directory:', paths)
if not paths:
    # Fail early rather than hand the trainer an empty file list.
    raise FileNotFoundError(f"no .txt training files found in {data_dir}")

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training: target vocabulary size, minimum frequency and special tokens.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
tokenizer.train(
    files=paths,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=special_tokens,
)

text files in current directory: ['../data/temperature_mask/temperature_mask.txt', '../data/temperature_mask/uniprot_sprot_mask.txt']



CPU times: user 1h 43min 14s, sys: 3min 10s, total: 1h 46min 24s
Wall time: 14min 1s


In [4]:
# Sanity check: tokenize one sample line and show the resulting sub-word tokens.
example_seq = "RTIRVLTTYNKTGYFIHKGVQRGVTYDAFIQVEKRLN is 22"
example_encoding = tokenizer.encode(example_seq)
example_encoding.tokens

['RT',
 'IRV',
 'LT',
 'TYN',
 'KT',
 'GYF',
 'IH',
 'KGV',
 'QR',
 'GVT',
 'YDAF',
 'IQVE',
 'KRLN',
 'Ġis',
 'Ġ22']

In [5]:
# Wrap each encoded sequence in start/end special tokens to fit the BERT model
from tokenizers.processors import BertProcessing

# (token, id) pairs looked up from the trained vocabulary; they are passed to
# BertProcessing positionally, "</s>" first and "<s>" second.
end_pair = ("</s>", tokenizer.token_to_id("</s>"))
start_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(end_pair, start_pair)

# Truncate encodings to at most 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [6]:
# Encode the sample again now that the post-processor is attached and show the Encoding summary.
tokenizer.encode(example_seq)

Encoding(num_tokens=17, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [7]:
# List the tokens of the sample; the output now starts with <s> and ends with </s>.
tokenizer.encode(example_seq).tokens

['<s>',
 'RT',
 'IRV',
 'LT',
 'TYN',
 'KT',
 'GYF',
 'IH',
 'KGV',
 'QR',
 'GVT',
 'YDAF',
 'IQVE',
 'KRLN',
 'Ġis',
 'Ġ22',
 '</s>']

In [8]:
# Display the tokenizer's repr (vocabulary size and configuration options).
tokenizer

Tokenizer(vocabulary_size=52000, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [12]:
# save tokenizer
import os

# Output folder for the tokenizer files, one level above the working directory.
token_dir = f"../{output_name}"
dir_already_present = os.path.isdir(token_dir)
# exist_ok=True makes the creation idempotent, so no separate check-then-create branch is needed.
os.makedirs(token_dir, exist_ok=True)
if dir_already_present:
    print(f'skip creation of {token_dir}')

# save object tokenizer as vocab.json and merges.txt
tokenizer.save_model(token_dir)

skip creation of ../KantaiBERT


['../KantaiBERT/vocab.json', '../KantaiBERT/merges.txt']