|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[2] Book lengths in characters, words, and tokens</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import requests
import matplotlib.pyplot as plt

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# draw inline figures as svg (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark-mode colors: uncomment these lines to switch them on
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# no box around the axes: drop the right and top spines
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top']   = False

# bold axis titles and axis labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution of figures written to disk by plt.savefig
plt.rcParams['savefig.dpi'] = 300

# **Part 1: Number of tokens**

In [None]:
# OpenAI cl100k_base tokenizer (used by GPT-4)
# The encoding object is stored in `tokenizer_4`, the name the later cells
# use for everything GPT-4 related.
# NOTE(review): get_encoding presumably fetches and caches the vocabulary file
# the first time it is called, so a network connection may be needed -- confirm.
import tiktoken
tokenizer_4 = tiktoken.get_encoding('cl100k_base')

In [None]:
# GPT2's tokenizer
# Loaded through the Hugging Face `transformers` package under the model name
# 'gpt2' and stored in `tokenizer_2`, the name the later cells use for
# everything GPT-2 related.
# NOTE(review): from_pretrained presumably downloads and caches the tokenizer
# files on first use, so a network connection may be needed -- confirm.
from transformers import AutoTokenizer
tokenizer_2 = AutoTokenizer.from_pretrained('gpt2')

In [None]:
# check all attributes
# dir() returns the names of every attribute and method of the tokenizer
# object; as the last expression in the cell, the list is shown as the output.
# Useful for finding where the tokenizer stores things like its vocabulary size.
dir(tokenizer_4)

In [None]:
# Vocabulary sizes of the two tokenizers.
# The two libraries expose the size under different attribute names:
#   tiktoken      -> n_vocab     (largest token ID + 1)
#   Hugging Face  -> vocab_size
print(f'GPT-4 tokenizer has {tokenizer_4.n_vocab:,} tokens.')
print(f'GPT-2 tokenizer has {tokenizer_2.vocab_size:,} tokens.')

# **Part 2: Token byte length distributions**

In [None]:
# Byte length of every token in the GPT-2 vocabulary,
# using initialization and a for-loop
n_tokens_2 = tokenizer_2.vocab_size
lengths_2 = np.zeros(n_tokens_2,dtype=int)
for t in range(n_tokens_2):
  # GPT-2 uses byte-level BPE: every character of the vocabulary string
  # stands for exactly one byte, so the string length is the byte length.
  # (Decoding to text first would miscount tokens that hold only part of a
  # multi-byte character.)
  lengths_2[t] = len(tokenizer_2.convert_ids_to_tokens(t))


In [None]:
# initialize (IDs that cannot be decoded keep a length of 0)
n_tokens_4 = tokenizer_4.n_vocab
lengths_4 = np.zeros(n_tokens_4,dtype=int)

# get the lengths
for t in range(n_tokens_4):

  # some token IDs cannot be decoded in isolation: n_vocab is the largest
  # ID + 1, and a few IDs below it are not assigned to any token.
  # tiktoken raises KeyError for those, so only that error is skipped.
  try:
    lengths_4[t] = len(tokenizer_4.decode_single_token_bytes(t))
  except KeyError:
    pass

In [None]:
# bin counts for lengths
# (np.bincount only accepts non-negative integers, hence the cast)
bincounts_2 = np.bincount(lengths_2.astype(int))
bincounts_4 = np.bincount(lengths_4.astype(int))

# note: the first element in the np.bincount output counts the entries with
# length 0 (token IDs that could not be decoded), which we can ignore.
# Each curve is scaled by its own peak so the two vocabularies are comparable.

# plot
plt.figure(figsize=(10,4))
plt.plot(range(1,len(bincounts_4)),bincounts_4[1:]/bincounts_4[1:].max(),'s-',color=[.3,.9,.3],markerfacecolor=[.7,.9,.7],label='GPT-4')
plt.plot(range(1,len(bincounts_2)),bincounts_2[1:]/bincounts_2[1:].max(),'o-',color=[.3,.3,.9],markerfacecolor=[.7,.7,.9],label='GPT-2')

plt.legend(fontsize=14)
plt.gca().set(xlabel='Token length (byte)',ylabel='Density',
              title='Distributions of token lengths',yscale='log',xscale='log')

plt.tight_layout()
plt.savefig('ch2_proj7_part2.png')
plt.show()

# **Part 3: Text token lengths**

In [None]:
# Every book lives at the same address pattern on Project Gutenberg;
# only the numerical code differs from book to book.
baseurl = 'https://www.gutenberg.org/cache/epub/'

# One [code, title] pair per book.
bookurls = [
    [code, title]
    for code, title in (
        ('84',    'Frankenstein'),
        ('64317', 'GreatGatsby'),
        ('11',    'AliceWonderland'),
        ('1513',  'RomeoJuliet'),
        ('76',    'HuckFinn'),
        ('219',   'HeartDarkness'),
        ('2591',  'GrimmsTales'),
        ('2148',  'EdgarAllenPoe'),
        ('36',    'WarOfTheWorlds'),
        ('829',   'GulliversTravels'),
    )
]

In [None]:
# initialize: one entry per book
tokens_2 = np.zeros(len(bookurls))      # number of GPT-2 tokens
tokens_4 = np.zeros(len(bookurls))      # number of GPT-4 tokens
token_lens_2 = np.zeros(len(bookurls))  # average GPT-2 token length (bytes)
token_lens_4 = np.zeros(len(bookurls))  # average GPT-4 token length (bytes)


# loop over books (each entry is a [code, title] pair; only the code is needed)
for i,(code,_) in enumerate(bookurls):

  # get the text
  # (requests.get returns a Response object; the book itself is in .text)
  fullurl = f'{baseurl}{code}/pg{code}.txt'
  response = requests.get(fullurl,timeout=30)
  response.raise_for_status() # stop here rather than tokenize an error page
  text = response.text

  # tokenize the text
  # (Hugging Face may warn that the text is longer than GPT-2's context
  #  window; that is harmless here because the tokens are only counted)
  gpt2_toks = tokenizer_2.encode(text)
  gpt4_toks = tokenizer_4.encode(text)

  # count the numbers of tokens
  tokens_2[i] = len(gpt2_toks)
  tokens_4[i] = len(gpt4_toks)

  # count the average lengths of the tokens, in bytes
  # GPT-2: byte-level BPE, so each character of a vocabulary string is one byte
  # GPT-4: tiktoken returns the raw bytes of each token
  token_lens_2[i] = np.mean([ len(tok) for tok in tokenizer_2.convert_ids_to_tokens(gpt2_toks) ])
  token_lens_4[i] = np.mean([ len(tokenizer_4.decode_single_token_bytes(t)) for t in gpt4_toks ])

In [None]:
# setup the figure
_,axs = plt.subplots(1,2,figsize=(10,4))

### left panel: token counts

# the scatter plot (one marker per book)
axs[0].plot(tokens_2,tokens_4,'kh',markerfacecolor=[.9,.7,.7,.7],markersize=12)

# line of unity: books on this line have the same length in both tokenizers
count_lims = [ min(tokens_2.min(),tokens_4.min()), max(tokens_2.max(),tokens_4.max()) ]
axs[0].plot(count_lims,count_lims,'--',color=[.6,.6,.6],zorder=-10)

# stylize
axs[0].set(xlabel='GPT-2 tokens',ylabel='GPT-4 tokens',title='A) Book lengths in tokens')
axs[0].ticklabel_format(style='scientific',axis='both',scilimits=(0,0))


### right panel: token lengths
axs[1].plot(token_lens_2,token_lens_4,'ks',markerfacecolor=[.7,.7,.9,.7],markersize=12)

# line of unity
length_lims = [ min(token_lens_2.min(),token_lens_4.min()), max(token_lens_2.max(),token_lens_4.max()) ]
axs[1].plot(length_lims,length_lims,'--',color=[.6,.6,.6],zorder=-10)

# stylize
axs[1].set(xlabel='GPT-2 token lengths',ylabel='GPT-4 token lengths',title='B) Average token lengths (bytes)')


plt.tight_layout()
plt.savefig('ch2_proj7_part3.png')
plt.show()

# **Part 4: Translator functions**

In [None]:
# translation functions
def gpt2_to_4(toks):
  """Translate GPT-2 token IDs into GPT-4 token IDs.

  Token IDs mean different things in the two vocabularies, so the
  translation goes through the text: decode with the GPT-2 tokenizer,
  then encode the resulting text with the GPT-4 tokenizer.

  Parameters
  ----------
  toks : sequence of GPT-2 token IDs

  Returns
  -------
  The GPT-4 token IDs for the same text, as returned by tokenizer_4.encode.
  """
  newtxt  = tokenizer_2.decode(toks)
  newtoks = tokenizer_4.encode(newtxt)
  return newtoks


def gpt4_to_2(toks):
  """Translate GPT-4 token IDs into GPT-2 token IDs.

  Token IDs mean different things in the two vocabularies, so the
  translation goes through the text: decode with the GPT-4 tokenizer,
  then encode the resulting text with the GPT-2 tokenizer.

  Parameters
  ----------
  toks : sequence of GPT-4 token IDs

  Returns
  -------
  The GPT-2 token IDs for the same text, as returned by tokenizer_2.encode.
  """
  newtxt  = tokenizer_4.decode(toks)
  newtoks = tokenizer_2.encode(newtxt)
  return newtoks


In [None]:
# Demo: tokenize one sentence with both tokenizers, confirm that decoding
# gives the sentence back, then translate the tokens between tokenizers.
text = 'Canadian winters are kept WArM bY the friendliness of THE PEOPLE.'
print('Original text:\n ',text,'\n')

# tokenize
toks_2 = tokenizer_2.encode(text)
toks_4 = tokenizer_4.encode(text)

# print the original tokens
print('GPT-2 tokens:\n ',toks_2,'\n')
print('GPT-4 tokens:\n ',toks_4,'\n')

# show that the decoding reconstructs the original text
print('GPT-2 reconstruction:\n ',tokenizer_2.decode(toks_2),'\n')
print('GPT-4 reconstruction:\n ',tokenizer_4.decode(toks_4),'\n')

# and finally, translate between tokenizers
# (each translation should match the other tokenizer's original tokens above)
print('GPT-2 to GPT-4 translation:\n ',gpt2_to_4(toks_2),'\n')
print('GPT-4 to GPT-2 translation:\n ',gpt4_to_2(toks_4))