|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[2] Book lengths in characters, words, and tokens</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import requests
import matplotlib.pyplot as plt

In [None]:
### Global matplotlib styling applied to every figure drawn after this cell

# render inline figures as SVG rather than the default raster format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark mode (optional): uncomment these lines to switch the colors
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# no box around the axes: drop the right and top spines
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top']   = False

# bold titles and axis labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution used when a figure is written to disk with savefig
plt.rcParams['savefig.dpi'] = 300

# **Part 1: Number of tokens**

In [None]:
# OpenAI cl100k_base tokenizer (used by GPT-4), loaded by name through tiktoken.
# The resulting object is kept in tokenizer_4 and reused by the cells below.
import tiktoken
tokenizer_4 = tiktoken.get_encoding('cl100k_base')

In [None]:
# GPT2's tokenizer, loaded by name ('gpt2') through the transformers AutoTokenizer class.
# The resulting object is kept in tokenizer_2 and reused by the cells below.
from transformers import AutoTokenizer
tokenizer_2 = AutoTokenizer.from_pretrained('gpt2')

In [None]:
# check all attributes: dir() returns the names of the attributes and methods
# of the GPT-4 tokenizer object (being the last expression in the cell, the list is displayed)
dir(tokenizer_4)

In [None]:
# report the vocabulary size of each tokenizer.
# Note that the two objects expose it under different attribute names
# (n_vocab for tokenizer_4, vocab_size for tokenizer_2).
# The ':7,' format pads to 7 characters and inserts a thousands separator,
# so the two numbers line up in the printout.
print(f'GPT-4 tokenizer has {tokenizer_4.n_vocab:7,} tokens.')
print(f'GPT-2 tokenizer has {tokenizer_2.vocab_size:7,} tokens.')

# **Part 2: Token byte length distributions**

In [None]:
# Two equivalent ways to get the UTF-8 byte length of every GPT-2 token.

# (1) preallocate an integer array, then fill it one token ID at a time
lengths_2 = np.zeros(tokenizer_2.vocab_size,dtype=int)
for t in range(tokenizer_2.vocab_size):
  token_bytes = tokenizer_2.decode(t).encode('utf-8')
  lengths_2[t] = len(token_bytes)

# (2) the same computation as a list comprehension; this overwrites the result of (1)
#     with a list instead of a numpy array, which makes no difference for this project
lengths_2 = [
  len(tokenizer_2.decode(t).encode('utf-8'))
  for t in range(tokenizer_2.vocab_size)
]

In [None]:
# UTF-8 byte length of every GPT-4 token ID (one array element per ID);
# IDs that cannot be decoded are flagged with -1 so they can be filtered out later
lengths_4 = np.zeros(tokenizer_4.n_vocab,dtype=int)

# get the lengths
for t in range(tokenizer_4.n_vocab):

  # some token IDs cannot be decoded in isolation
  try:
    lengths_4[t] = len(tokenizer_4.decode([t]).encode('utf-8'))

  # never swallow a user interrupt or an interpreter exit: with a bare `except:`
  # pressing Ctrl-C would only write one more -1 and the loop would keep running
  except (KeyboardInterrupt,SystemExit):
    raise

  # any other failure marks the ID as undecodable.
  # NOTE(review): BaseException (not Exception) is deliberate -- some tiktoken releases
  # appear to report an unknown ID through a Rust panic exception that does not derive
  # from Exception; confirm against the installed version before narrowing this further.
  except BaseException:
    lengths_4[t] = -1

In [None]:
# histogram of byte lengths: element k of each output is the number of tokens with length k
# (the -1 placeholders in lengths_4 are filtered out first)
bincounts_4 = np.bincount(lengths_4[lengths_4>-1])
bincounts_2 = np.bincount(lengths_2)

# x-axis values are the lengths 1..max; index 0 (zero-length tokens) is skipped
xvals_4 = np.arange(1,len(bincounts_4))
xvals_2 = np.arange(1,len(bincounts_2))

# each curve is scaled by its own largest bin count
yvals_4 = bincounts_4[1:] / bincounts_4.max()
yvals_2 = bincounts_2[1:] / bincounts_2.max()

# draw both distributions in one axis
plt.figure(figsize=(10,4))
plt.plot(xvals_4,yvals_4,'s-',color=[.3,.9,.3],markerfacecolor=[.7,.9,.7],label='GPT-4')
plt.plot(xvals_2,yvals_2,'o-',color=[.3,.3,.9],markerfacecolor=[.7,.7,.9],label='GPT-2')
plt.legend(fontsize=14)

# labels, and logarithmic scaling on both axes
ax = plt.gca()
ax.set_xlabel('Token length (byte)')
ax.set_ylabel('Density')
ax.set_title('Distributions of token lengths')
ax.set_xscale('log')
ax.set_yscale('log')

# write the figure to disk, then display it
plt.tight_layout()
plt.savefig('ch2_proj7_part2.png')
plt.show()

# **Part 3: Text token lengths**

In [None]:
# all books have the same url format;
# they are unique by numerical code
# (the download loop builds each address as baseurl + code + '/pg' + code + '.txt')
baseurl = 'https://www.gutenberg.org/cache/epub/'

# one [code, title] pair per book. The code is stored as a string because it is
# concatenated directly into the url; the title is a short label without spaces.
# NOTE(review): the author's name is normally spelled 'Allan'; the 'EdgarAllenPoe'
# label is left unchanged here in case other cells refer to it -- confirm before renaming.
bookurls = [
    # code       title
    ['84',    'Frankenstein'    ],
    ['64317', 'GreatGatsby'     ],
    ['11',    'AliceWonderland' ],
    ['1513',  'RomeoJuliet'     ],
    ['76',    'HuckFinn'        ],
    ['219',   'HeartDarkness'   ],
    ['2591',  'GrimmsTales'     ],
    ['2148',  'EdgarAllenPoe'   ],
    ['36',    'WarOfTheWorlds'  ],
    ['829',   'GulliversTravels']
]

In [None]:
# initialize one entry per book:
#   tokens_*     -> total number of tokens in the book
#   token_lens_* -> mean UTF-8 byte length of the book's tokens
# (suffix _2 is the GPT-2 tokenizer, _4 is the GPT-4 tokenizer)
tokens_2 = np.zeros(len(bookurls))
tokens_4 = np.zeros(len(bookurls))
token_lens_2 = np.zeros(len(bookurls))
token_lens_4 = np.zeros(len(bookurls))


# loop over books
for i,(code,title) in enumerate(bookurls):

  # get the text.
  # The timeout keeps a stalled connection from hanging the cell indefinitely, and
  # raise_for_status() stops with an error on a failed request (e.g. 404 or 5xx)
  # instead of silently tokenizing the server's error page as if it were the book.
  fullurl = baseurl + code + '/pg' + code + '.txt'
  response = requests.get(fullurl,timeout=30)
  response.raise_for_status()
  text = response.text

  # tokenize the text
  gpt2_toks = tokenizer_2.encode(text)
  gpt4_toks = tokenizer_4.encode(text)

  # count the numbers of tokens
  tokens_2[i] = len( gpt2_toks )
  tokens_4[i] = len( gpt4_toks )

  # average token length in bytes: decode each token on its own, re-encode it as UTF-8,
  # and average the byte counts (tokenizer_4.decode is given a one-element list)
  token_lens_2[i] = np.mean([len(tokenizer_2.decode(t).encode('utf-8'))   for t in gpt2_toks])
  token_lens_4[i] = np.mean([len(tokenizer_4.decode([t]).encode('utf-8')) for t in gpt4_toks])


In [None]:
# one row with two side-by-side panels
_,axs = plt.subplots(nrows=1,ncols=2,figsize=(10,4))


### panel A (left): tokens per book, GPT-2 on the x-axis against GPT-4 on the y-axis
axs[0].plot(tokens_2,tokens_4,'kh',markerfacecolor=[.9,.7,.7,.7],markersize=12)

# dashed diagonal (y=x) from the smallest to the largest value across both tokenizers
count_range = np.concatenate((tokens_2,tokens_4))
minlength = count_range.min()
maxlength = count_range.max()
axs[0].plot([minlength,maxlength],[minlength,maxlength],'--',color=[.5,.5,.5])

# labels, with scientific notation on both tick axes
axs[0].set_xlabel('GPT-2 tokens')
axs[0].set_ylabel('GPT-4 tokens')
axs[0].set_title('A) Book lengths in tokens')
axs[0].ticklabel_format(style='scientific',axis='both',scilimits=(0,0))


### panel B (right): mean token length per book, same layout as panel A
axs[1].plot(token_lens_2,token_lens_4,'ks',markerfacecolor=[.7,.9,.7,.7],markersize=12)

# dashed diagonal (y=x) for this panel's value range
len_range = np.concatenate((token_lens_2,token_lens_4))
minlength = len_range.min()
maxlength = len_range.max()
axs[1].plot([minlength,maxlength],[minlength,maxlength],'--',color=[.5,.5,.5])

# labels
axs[1].set_xlabel('GPT-2 token lengths')
axs[1].set_ylabel('GPT-4 token lengths')
axs[1].set_title('B) Average token lengths (bytes)')


# write the figure to disk, then display it
plt.tight_layout()
plt.savefig('ch2_proj7_part3.png')
plt.show()

# **Part 4: Translator functions**

In [None]:
# translation functions
def gpt2_to_4(toks):
  """Translate GPT-2 token IDs into GPT-4 token IDs.

  The tokens are first decoded back into text with the GPT-2 tokenizer
  (tokenizer_2); that text is then encoded with the GPT-4 tokenizer
  (tokenizer_4), and the resulting tokens are returned.
  """
  # step 1: GPT-2 tokens -> text
  decoded_text = tokenizer_2.decode(toks)

  # step 2: text -> GPT-4 tokens
  return tokenizer_4.encode(decoded_text)

# this function has more compact code but is less human-readable
def gpt4_to_2(toks):
  """Translate GPT-4 token IDs into GPT-2 token IDs.

  The tokens are decoded into text with the GPT-4 tokenizer (tokenizer_4),
  and that text is immediately encoded with the GPT-2 tokenizer (tokenizer_2).
  Both steps are nested in a single expression.
  """
  return tokenizer_2.encode( tokenizer_4.decode(toks) )

In [None]:
# test sentence (note the mixed upper/lower case)
text = 'Canadian winters are kept WArM bY the friendliness of THE PEOPLE.'
print('Original text:\n ',text,'\n')

# encode the same sentence with each tokenizer and show the token IDs
toks_4 = tokenizer_4.encode(text)
toks_2 = tokenizer_2.encode(text)
print('GPT-2 tokens:\n ',toks_2)
print('GPT-4 tokens:\n ',toks_4,'\n')

# round trip within each tokenizer: decode its own tokens back into text
recon_2 = tokenizer_2.decode(toks_2)
recon_4 = tokenizer_4.decode(toks_4)
print('GPT-2 reconstruction:\n ',recon_2)
print('GPT-4 reconstruction:\n ',recon_4,'\n')

# translate across tokenizers, then decode with the *target* tokenizer to read the result
toks_2_as_4 = gpt2_to_4(toks_2)
toks_4_as_2 = gpt4_to_2(toks_4)
print('GPT-2 to GPT-4 translation:\n ',tokenizer_4.decode(toks_2_as_4))
print('GPT-4 to GPT-2 translation:\n ',tokenizer_2.decode(toks_4_as_2))