# Sentence piece tokenizer

This notebook is a demonstration of how to use the sentence piece tokenizer. The notebook is based on the official documentation notebook available here: https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb

The original examples is under Apache License 2.0. The changes in this workbook include:
1) different training set, as in the rest of the book
2) different example, as in the rest of the book

In [2]:
# installing the sentence piece tokenizer
!pip install -q sentencepiece

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.3 MB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m18.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import sentencepiece as spm

# this statement trains the tokenizer
spm.SentencePieceTrainer.train('--input="/content/drive/MyDrive/ds/cs_dos/nx_icmp_checksum_compute.c" --model_prefix=m --vocab_size=200')

# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load('m.model')

True

In [5]:
strCProgram = '''
int main(int argc, void **argc)
{
  printf("%s", "Hello World\n");
  return 0; 
}
'''

In [6]:
# encode: text => id
print(sp.encode_as_pieces(strCProgram))
print(sp.encode_as_ids(strCProgram))

['▁in', 't', '▁', 'm', 'a', 'in', '(', 'in', 't', '▁a', 'r', 'g', 'c', ',', '▁', 'v', 'o', 'i', 'd', '▁*', '*', 'a', 'r', 'g', 'c', ')', '▁', '{', '▁', 'p', 'r', 'in', 't', 'f', '(', '"', '%', 's', '"', ',', '▁', '"', 'H', 'e', 'll', 'o', '▁', 'W', 'o', 'r', 'l', 'd', '▁', '"', ')', ';', '▁', 're', 't', 'u', 'r', 'n', '▁0', ';', '▁', '}']
[50, 25, 3, 79, 38, 75, 20, 75, 25, 42, 60, 116, 32, 96, 3, 182, 45, 19, 66, 6, 58, 38, 60, 116, 32, 10, 3, 61, 3, 34, 60, 75, 25, 82, 20, 120, 0, 12, 120, 96, 3, 120, 198, 30, 136, 45, 3, 70, 45, 60, 44, 66, 3, 120, 10, 11, 3, 92, 25, 31, 60, 43, 107, 11, 3, 62]


In [None]:
# decode: id => text
print(sp.decode_pieces(['▁This', '▁is', '▁a', '▁t', 'est']))
print(sp.decode_ids([18, 135, 12]))

▁This▁is a▁test
This


In [None]:
# returns vocab size
print(sp.get_piece_size())

# id <=> piece conversion
print(sp.id_to_piece(18))
print(sp.piece_to_id('T'))

# returns 0 for unknown tokens (we can change the id for UNK)
print(sp.piece_to_id('__MUST_BE_UNKNOWN__'))

# , ,  are defined by default. Their ids are (0, 1, 2)
#  and  are defined as 'control' symbol.
for id in range(3):
  print(sp.id_to_piece(id), sp.is_control(id))

200
T
18
0
<unk> False
<s> True
</s> True
