# Word piece tokenizer example

This example shows how to use a more advanced tokenizer based on word pieces. 

The tokenizer learns sub-word units ("word pieces") – frequently occurring fragments of words – from a training text and then uses them as tokens.

In [2]:
!pip install -q tokenizers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
from tokenizers import BertWordPieceTokenizer

# Create an (untrained) BERT-style WordPiece tokenizer. The keyword arguments
# configure the text normalisation applied before words are split into pieces.
tokenizer = BertWordPieceTokenizer(
    lowercase=True,              # fold all input text to lower case
    strip_accents=False,         # leave accented characters untouched
    handle_chinese_chars=False,  # no special spacing around CJK characters
    clean_text=True,             # drop control characters, normalise whitespace
)

In [3]:
# Next, we need a dataset to train the tokenizer on.
# In our case the dataset is simply the source code of a program.

# This example uses a file from an open-source component, Azure NetX.
# Which file is used is not important, as long as it gives us a set of
# tokens that we want to analyze.
# NOTE: despite the plural name, `paths` is a single path string, not a list.
# NOTE(review): the path lies under /content/drive - presumably Google Drive
# must be mounted before this cell is useful; confirm and adjust for your setup.
paths = '/content/drive/MyDrive/ds/cs_dos/nx_icmp_checksum_compute.c'

In [17]:
# Train the WordPiece vocabulary on the text of the source file.
# `paths` holds a single file path (a string).
tokenizer.train(files=paths,
                vocab_size=30_000,       # upper bound on the vocabulary size
                min_frequency=1,         # keep pairs even if they occur only once
                limit_alphabet=1000,     # cap on distinct initial characters
                wordpieces_prefix='##',  # marks pieces that continue a word
                # Fixed: '[PAD' was missing its closing bracket, which put a
                # malformed '[PAD' token into the vocabulary instead of '[PAD]'.
                special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

In [21]:
# Inspect the learned vocabulary: a dict that maps each token to its integer id.
# Entries that start with '##' are word pieces that continue a word.
tokenizer.get_vocab()

{'##ll': 183,
 'disable': 326,
 'al': 263,
 '##cket': 90,
 '##s': 65,
 'computed': 484,
 'is': 178,
 '##ure': 309,
 '09': 357,
 'po': 151,
 'so': 400,
 'upper': 345,
 '##ic': 136,
 '##sion': 296,
 '##2': 84,
 '##it': 152,
 'lic': 197,
 '##n': 68,
 'all': 366,
 '##v': 80,
 'wor': 108,
 'uchar': 226,
 '##6': 82,
 '9': 24,
 'cal': 267,
 'be': 367,
 'aka': 505,
 'previous': 546,
 'ma': 272,
 'date': 376,
 'boundary': 506,
 'pa': 89,
 'case': 374,
 'word': 109,
 '##ter': 141,
 '##xt': 186,
 'ul': 127,
 'det': 196,
 'output': 521,
 '##te': 115,
 'shi': 233,
 'swap': 321,
 '##ig': 239,
 '##tocol': 476,
 'bit': 172,
 '##ly': 439,
 'swa': 202,
 'move': 253,
 'else': 336,
 'generates': 548,
 'car': 372,
 'new': 473,
 'alignment': 333,
 'include': 261,
 'z': 53,
 '2': 20,
 'set': 232,
 'calcul': 493,
 '##ort': 125,
 'ptr': 105,
 'az': 264,
 '##wer': 187,
 '##ur': 111,
 '##cre': 300,
 '##cl': 243,
 'pro': 393,
 '##p': 59,
 'hi': 383,
 'found': 380,
 'lower': 193,
 'do': 375,
 'pi': 276,
 '##ent': 

In [4]:
# A small C program used as the input text for the tokenizers below.
# NOTE: this is a regular (non-raw) Python string, so the `\n` inside printf
# becomes a real line break in the text that gets tokenized.
# NOTE(review): the second parameter of main repeats the name `argc`
# (probably meant to be `char **argv`). The text is left as-is because the
# recorded outputs in this notebook were produced from exactly this string.
strCProgram = '''
int main(int argc, void **argc)
{
  printf("%s", "Hello World\n");
  return 0; 
}
'''

In [19]:
# Run the trained WordPiece tokenizer on the C program defined above.
# `encode` returns an Encoding object.
tokenizedText = tokenizer.encode(strCProgram)

# Display the Encoding: its repr lists the number of tokens and the available
# attributes (ids, type_ids, tokens, offsets, attention_mask, ...).
tokenizedText

Encoding(num_tokens=50, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [20]:
# The token strings the input program was split into.
# Pieces prefixed with '##' continue the preceding piece of the same word;
# '[UNK]' stands in for input that is not covered by the trained vocabulary.
tokenizedText.tokens

['in',
 '##t',
 'ma',
 '##in',
 '(',
 'in',
 '##t',
 'a',
 '##r',
 '##g',
 '##c',
 ',',
 'v',
 '##o',
 '##i',
 '##d',
 '*',
 '*',
 'a',
 '##r',
 '##g',
 '##c',
 ')',
 '{',
 'p',
 '##ri',
 '##n',
 '##t',
 '##f',
 '(',
 '"',
 '[UNK]',
 's',
 '"',
 ',',
 '"',
 'h',
 '##e',
 '##ll',
 '##o',
 'wor',
 '##l',
 '##d',
 '"',
 ')',
 ';',
 'return',
 '0',
 ';',
 '}']

In [22]:
# The same tokens as integer ids, i.e. each token's entry in the vocabulary.
tokenizedText.ids

[110,
 57,
 272,
 104,
 10,
 110,
 57,
 30,
 61,
 63,
 69,
 14,
 49,
 58,
 62,
 72,
 12,
 12,
 30,
 61,
 63,
 69,
 11,
 54,
 44,
 204,
 68,
 57,
 76,
 10,
 6,
 1,
 46,
 6,
 14,
 6,
 37,
 66,
 183,
 58,
 108,
 70,
 72,
 6,
 11,
 26,
 343,
 18,
 26,
 56]

## BPE - Byte Pair Encoding

In this part of the notebook, we use the same program text, but we train a different kind of tokenizer: Byte Pair Encoding (BPE) instead of WordPiece.

In [5]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

# First, we instantiate a tokenizer backed by an (untrained) BPE model.
# `unk_token` is the token emitted for input the model cannot represent.
# NOTE: this rebinds `tokenizer`, so the WordPiece tokenizer created earlier
# is no longer reachable under that name from here on.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

In [7]:
# Set up the two remaining pieces needed before the BPE model can be trained.
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# Pre-tokenizer: splits the raw text into word-level chunks first, so that
# learned BPE merges never span across whitespace.
tokenizer.pre_tokenizer = Whitespace()

# Trainer: the Hugging Face object that carries the training configuration.
# The special tokens are added to the vocabulary in the order listed here.
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [10]:
# Finally, we train the BPE tokenizer using the trainer configured above.
# The training set is the very same file as before; `paths` is a single
# string, so it is wrapped in a list here.
tokenizer.train([paths], trainer)

In [11]:
# Inspect the vocabulary learned by the BPE tokenizer: a dict mapping each
# token to its integer id (special tokens such as '[PAD]' are included).
tokenizer.get_vocab()

{'be': 500,
 'g_temp': 201,
 '>': 28,
 'ICMP': 182,
 'll': 235,
 'w': 74,
 '[PAD]': 3,
 'Software': 490,
 'ment': 286,
 'UCHA': 223,
 'ines': 589,
 '********************************': 105,
 'tw': 547,
 'DESCR': 405,
 'EU': 464,
 'wh': 371,
 'these': 378,
 '/*': 85,
 'IP': 325,
 'End': 322,
 'computed': 595,
 '_': 53,
 'Term': 492,
 'Corporation': 436,
 'NX_CHA': 383,
 'ry': 538,
 '=': 27,
 'st': 191,
 'FU': 465,
 'ET': 320,
 'comment': 587,
 'NX_DISABLE_ICMP': 428,
 'K': 38,
 'ed': 134,
 'cre': 349,
 'ter': 150,
 '*(': 444,
 'define': 602,
 '(': 10,
 'current_packet': 168,
 'nx_api': 594,
 'Mic': 220,
 'lied': 522,
 'padd': 536,
 'u': 72,
 'compute': 252,
 'logic': 403,
 '_ptr': 104,
 'Mu': 475,
 'RCE_CODE': 641,
 'SCR': 338,
 'OUTPUT': 638,
 'NAME': 637,
 'nd': 203,
 'eturn': 426,
 'CE_': 459,
 'CK': 215,
 'are': 228,
 'ointer': 170,
 'to': 136,
 'C': 31,
 'operation': 601,
 'Cor': 318,
 'etur': 375,
 '1': 19,
 '9': 24,
 'X': 50,
 'text': 556,
 'NX_SOURCE_CODE': 667,
 'need': 394,
 'W

In [13]:
# Encode the same C program with the BPE tokenizer.
# NOTE: this overwrites the `tokenizedText` produced by the WordPiece tokenizer.
tokenizedText = tokenizer.encode(strCProgram)

# Show the resulting token strings. Unlike the WordPiece output, there is no
# '##' continuation prefix here and the original letter case is kept.
tokenizedText.tokens

['in',
 't',
 'm',
 'ain',
 '(',
 'in',
 't',
 'ar',
 'g',
 'c',
 ',',
 'v',
 'o',
 'i',
 'd',
 '**',
 'ar',
 'g',
 'c',
 ')',
 '{',
 'p',
 'r',
 'in',
 't',
 'f',
 '(',
 '"',
 '[UNK]',
 's',
 '"',
 ',',
 '"',
 'H',
 'e',
 'll',
 'o',
 'W',
 'or',
 'l',
 'd',
 '"',
 ');',
 'return',
 '0',
 ';',
 '}']

In [14]:
# The BPE tokens as integer ids from the BPE vocabulary.
tokenizedText.ids

[95,
 71,
 65,
 499,
 10,
 95,
 71,
 227,
 60,
 56,
 14,
 73,
 67,
 62,
 57,
 81,
 227,
 60,
 56,
 11,
 78,
 68,
 69,
 95,
 71,
 59,
 10,
 6,
 0,
 70,
 6,
 14,
 6,
 36,
 58,
 235,
 67,
 49,
 97,
 64,
 57,
 6,
 124,
 540,
 18,
 26,
 80]