|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[1] Three tokenization schemes</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# svg plots (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark-mode colors: un-comment these three lines to enable
# plt.rc('figure',facecolor='#282a2c',edgecolor='#282a2c')
# plt.rc('axes',facecolor='#282a2c',edgecolor='#DDE2F4',labelcolor='#DDE2F4')
# plt.rc(('xtick','ytick','text'),color='#DDE2F4')

# drop the right and top box edges of every axis
plt.rc('axes.spines',right=False,top=False)

# bold axis titles and axis labels
plt.rc('axes',titleweight='bold',labelweight='bold')

# dpi used when a figure is written with savefig
plt.rc('savefig',dpi=300)

# **Part 1: Character-based tokenization**

In [None]:
# start with a text as one string variable
txt = 'The way you do anything is the way you do everything.'

# use list-comprehension to create a list of characters
# (every character counts, including spaces and punctuation)
characters = [c for c in txt]

# print the two versions
print('The full text:\n',txt)
print('As a list of characters:\n',characters)

# the vocab (sorted set of unique elements, then make it a list)
# note: sorted() already returns a list, so no extra conversion is needed
char_vocab = sorted(set(characters))
print('The vocabulary is:\n',char_vocab)

# print some numerical info
print(f'There are {len(characters)} characters, {len(char_vocab)} of which are unique.')

In [None]:
# the tokens and their indices

# vertical format
# for index,tok in enumerate(char_vocab):
#   print(f'Index {index:2} is "{tok}"')

# horizontal format is better for the book figure ;)
# top row: each token in quotes, one 7-character column per vocab entry
print('Token :',end='')
for v in char_vocab:
  print(f' "{v}"  |', end='')
print('\n','-'*131)

# bottom row: the index of each token, using the same column width
# so that each index lines up under its token
print('Index :',end='')
for i in range(len(char_vocab)):
  print(f' {i:3}  |', end='')
print()


In [None]:
# initialize a list (one slot per character of the text)
tokens = [0]*len(txt)

# loop through text, find the indices into the vocab
# this is called encoding
for i,c in enumerate(txt):
  # position of this character in the sorted vocab is its token ID
  tokens[i] = char_vocab.index(c)

print(tokens)

In [None]:
# create a figure
_,ax = plt.subplots(1,figsize=(12,4))

# plot the tokens
ax.plot(tokens,'ks',markersize=12,markerfacecolor=[.7,.7,.9])
ax.set(xlabel='Character index',yticks=range(len(char_vocab)))
ax.grid(linestyle='--',axis='y')

# invisible axis for right-hand-side labels
# (a twin y-axis with no data; it only carries the token for each index)
ax2 = ax.twinx()
ax2.set_yticks(range(len(char_vocab)))
ax2.set_yticklabels([f'"{v}"' for v in char_vocab]) # quotes make the space token visible
ax2.set_ylim(ax.get_ylim()) # same limits as the left axis, so labels align with the grid lines

plt.tight_layout()
plt.savefig('ch2_proj1_part1-tokenscatter.png')
plt.show()

# **Part 2: Word-based tokenization**

In [None]:
# split into words
# (splitting on whitespace keeps punctuation attached, e.g., 'everything.')
words = txt.split()

# print the words
print('The list of words is:\n',words)

# find the unique words, and print numerical info
# (case-sensitive, so 'The' and 'the' are separate vocab entries)
word_vocab = sorted(set(words))
print(f'There are {len(words)} words, {len(word_vocab)} of which are unique.\n')

print('The vocab is:\n',word_vocab)

In [None]:
# initialize a list (one slot per word of the text)
tokens = [0]*len(words)

# loop through text, find the indices into the vocab
for i,word in enumerate(words):
  # position of this word in the sorted vocab is its token ID
  tokens[i] = word_vocab.index(word)

print(tokens)

In [None]:
# create a figure
_,ax = plt.subplots(1,figsize=(10,3))

# plot the tokens
ax.plot(tokens,'ks',markersize=15,markerfacecolor=[.7,.7,.9])
ax.set(xlabel='Word index',yticks=range(len(word_vocab)))
ax.grid(linestyle='--',axis='y')

# invisible axis for right-hand-side labels
# (a twin y-axis with no data; it only carries the word for each index)
ax2 = ax.twinx()
ax2.set_yticks(range(len(word_vocab)))
ax2.set_yticklabels(word_vocab)
ax2.set_ylim(ax.get_ylim()) # same limits as the left axis, so labels align with the grid lines

plt.tight_layout()
plt.savefig('ch2_proj1_part2.png')
plt.show()

# **Part 3: OpenAI's GPT2 tokenizer**

In [None]:
# GPT2 tokenizer
# AutoTokenizer comes from the Hugging Face transformers package
from transformers import AutoTokenizer
# load the pretrained tokenizer registered under the name 'gpt2'
# NOTE(review): presumably fetched from the model hub on first use and cached afterwards -- confirm
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
# inspect the tokenizer: number of tokens in its vocabulary
# (as the last expression in a notebook cell, the value is displayed)
tokenizer.vocab_size

In [None]:
# encode the text into a list of integer token IDs
gpt2_tokens = tokenizer.encode(txt)

# decode each token ID on its own to see the piece of text it stands for
for token in gpt2_tokens:
  print(f'Token {token:4} is "{tokenizer.decode([token])}"')

In [None]:
# leading spaces
# the same word is encoded with and without a preceding space, to compare the tokens
word1 = ' Mike'
word2 = 'Mike'

for w in [word1,word2]:
  print(f'Token for "{w}" is {tokenizer.encode(w)}')

In [None]:
# total number of GPT-2 tokens in the text, and how many distinct token IDs occur
print(f'There are {len(gpt2_tokens)} tokens, {len(set(gpt2_tokens))} of which are unique.')

In [None]:
# side-by-side comparison of the three tokenization schemes
_,axs = plt.subplots(1,2,figsize=(10,3))

# bars of total counts
axs[0].bar(0,len(characters))
axs[0].bar(1,len(words))
axs[0].bar(2,len(gpt2_tokens))
axs[0].set(xticks=[0,1,2],xticklabels=['Characters','Words','GPT-2'],
           ylabel='Count',title='Total')

# bars of unique counts
# (for characters and words this is the vocab size; for GPT-2 it is the
#  number of distinct token IDs used in this text, not the full GPT-2 vocab)
axs[1].bar(0,len(char_vocab))
axs[1].bar(1,len(word_vocab))
axs[1].bar(2,len(set(gpt2_tokens)))
axs[1].set(xticks=[0,1,2],xticklabels=['Characters','Words','GPT-2'],
           ylabel='Count',title='Unique')

plt.tight_layout()
plt.savefig('ch2_proj1_part3.png')
plt.show()