# Byte Pair Encoding

This workbook demonstrates how to use the BPE tokenizer. It is based on the HuggingFace BPE implementation. 

The original code, although with a different example, can be found in this tutorial: https://huggingface.co/docs/tokenizers/quicktour

In [2]:
!pip install -q tokenizers


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 36.7 MB/s eta 0:00:00

In [3]:
# the tokenizer building blocks come from
# the HuggingFace `tokenizers` library
from tokenizers import Tokenizer
from tokenizers.models import BPE

# first build the (still untrained) BPE model, registering "[UNK]"
# as its unknown token, then wrap the model in a Tokenizer
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)

In [4]:
from tokenizers.trainers import BpeTrainer

# the trainer is the object that manages the training
# process of the tokenizer; the special tokens are handed
# to it when it is created
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
trainer = BpeTrainer(special_tokens=special_tokens)

In [5]:
from tokenizers.pre_tokenizers import Whitespace

# attach a pre-tokenizer to the tokenizer created earlier:
# it cuts the raw text into word-like pieces before the
# BPE model is applied to them
whitespace_splitter = Whitespace()
tokenizer.pre_tokenizer = whitespace_splitter

In [9]:
# next comes the training data: the source code of a program

# the file used here is taken from an open source component - Azure NetX;
# which file it is does not matter much, as long as it provides
# a set of tokens to analyze
dataset_file = '/content/drive/MyDrive/ds/cs_dos/nx_icmp_checksum_compute.c'

# the list of training files, with the dataset file as its only entry
paths = [dataset_file]

In [11]:
# everything is in place: train the tokenizer on the
# dataset files, with the trainer driving the process
tokenizer.train(files=paths, trainer=trainer)

In [12]:
# fetch the vocabulary of the trained tokenizer (token -> id)
# and display it as the result of the cell
vocabulary = tokenizer.get_vocab()
vocabulary

{'only': 565,
 'he': 87,
 'RTOS': 416,
 'DE': 266,
 'CH': 154,
 'a': 54,
 'ps': 534,
 'will': 372,
 'NX_SHIFT_BY': 311,
 'O': 42,
 'carry': 651,
 'NX_DISABLE_ICMP_TX_CHECKSUM': 620,
 'ol': 529,
 'N': 41,
 'ste': 543,
 '_16': 174,
 'cksu': 101,
 'ck': 86,
 'OS': 330,
 'nx_packet_append_ptr': 246,
 'USH': 173,
 'al': 199,
 '20': 211,
 '+': 13,
 'rosoft': 249,
 'Mask': 634,
 'clude': 309,
 'right': 424,
 '5': 22,
 'ed': 134,
 'Multi': 635,
 'NX_LO': 243,
 'ECKSU': 413,
 'essage': 409,
 'CK': 215,
 '>>': 264,
 'NX_SHIFT_BY_16': 312,
 'per': 362,
 'lo': 236,
 'Azure': 410,
 'If': 327,
 'PORTABLE': 640,
 'length': 123,
 '******************************************/': 165,
 'h': 61,
 '_BY': 275,
 'computes': 596,
 'WER_16_MAS': 257,
 'CTION': 461,
 'ion': 232,
 'Initialize': 612,
 '_nx_icmp': 345,
 'Protoco': 639,
 '//': 446,
 '}': 80,
 'result': 568,
 'if': 126,
 'str': 544,
 'system': 679,
 'pend_ptr': 194,
 'ser': 545,
 'E_': 267,
 'ULONG': 139,
 'lt': 357,
 'endian': 387,
 'ial': 355,
 'gi

In [13]:
# a simple C program that will serve as input for the tokenizer
# NOTE: this is not a raw string, so the "\n" after "Hello World" is a
# real newline character in the text, not a backslash followed by "n"
strCProgram = (
    "\n"
    "int main(int argc, void **argc)\n"
    "{\n"
    '  printf("%s", "Hello World\n");\n'
    "  return 0; \n"
    "}\n"
)

In [14]:
# run the trained tokenizer over the C program defined above
tokenizedText = tokenizer.encode(sequence=strCProgram)

# leaving the result as the last expression of the cell
# displays a summary of what the encoding object contains
tokenizedText

Encoding(num_tokens=47, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [16]:
# the token strings the program was split into, displayed as the cell result
tokens = tokenizedText.tokens
tokens

['in',
 't',
 'm',
 'ain',
 '(',
 'in',
 't',
 'ar',
 'g',
 'c',
 ',',
 'v',
 'o',
 'i',
 'd',
 '**',
 'ar',
 'g',
 'c',
 ')',
 '{',
 'p',
 'r',
 'in',
 't',
 'f',
 '(',
 '"',
 '[UNK]',
 's',
 '"',
 ',',
 '"',
 'H',
 'e',
 'll',
 'o',
 'W',
 'or',
 'l',
 'd',
 '"',
 ');',
 'return',
 '0',
 ';',
 '}']