|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[2] Book lengths in characters, words, and tokens</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import requests
import matplotlib.pyplot as plt

In [None]:
### matplotlib adjustments (dark-mode variants are left commented out)

# render inline figures as SVG (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark mode: uncomment the next four calls
# plt.rc('figure',facecolor='#282a2c',edgecolor='#282a2c')
# plt.rc('axes',facecolor='#282a2c',edgecolor='#DDE2F4',labelcolor='#DDE2F4')
# plt.rc(('xtick','ytick'),color='#DDE2F4')
# plt.rc('text',color='#DDE2F4')

# hide the right and top spines; bold axes titles and axis labels
plt.rc('axes.spines',right=False,top=False)
plt.rc('axes',titleweight='bold',labelweight='bold')

# Import the two tokenizers

In [None]:
# GPT2's tokenizer
# (pretrained 'gpt2' checkpoint; used as tokenizer_G in the rest of the notebook)
from transformers import AutoTokenizer
tokenizer_G = AutoTokenizer.from_pretrained('gpt2')

In [None]:
# load BERT's tokenizer (only the tokenizer; no model is loaded in this cell)
# (pretrained 'bert-base-uncased' checkpoint; used as tokenizer_B in the rest of the notebook)
from transformers import BertTokenizer
tokenizer_B = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# demo of the additional (special) tokens that encode() adds by default
text = 'Peanut and strawberry'
tokens = tokenizer_B.encode(text)

# the text next to the full token sequence
print(f'Original text:\n {text}\n')
print(f'Token sequence:\n {tokens}\n')

# decode each token id on its own to see what it stands for
for tok_id in tokens:
  decoded = tokenizer_B.decode(tok_id)
  print(f'Token index {tok_id:6} is "{decoded}"')

In [None]:
# option 1: slice off the first and last tokens before decoding
trimmed = tokens[1:-1]
print(tokenizer_B.decode(trimmed))

# option 2: ask encode() not to add the special tokens in the first place
tokens = tokenizer_B.encode(text,add_special_tokens=False)
print(tokenizer_B.decode(tokens))

# **Part 1: Number of tokens**

In [None]:
# list the names of the attributes and methods of the BERT tokenizer object
# (shown as the cell's output because it is the last expression in the cell)
dir(tokenizer_B)

In [None]:
# report the vocabulary size of each tokenizer (thousands separated by commas)
for label,tkz in (('BERT',tokenizer_B),('GPT2',tokenizer_G)):
  print(f'{label} tokenizer has {tkz.vocab_size:,} tokens.')

# **Part 2: Token byte length distributions**

In [None]:
# UTF-8 byte length of the decoded text of every token id in BERT's vocab
# (one integer per token id, in id order)
lengths_B = np.array(
    [ len(tokenizer_B.decode(idx).encode('utf-8')) for idx in range(tokenizer_B.vocab_size) ],
    dtype=int)

In [None]:
# UTF-8 byte length of the decoded text of every token id in GPT2's vocab
# (one integer per token id, in id order)
lengths_G = np.array(
    [ len(tokenizer_G.decode(idx).encode('utf-8')) for idx in range(tokenizer_G.vocab_size) ],
    dtype=int)

In [None]:
# number of tokens at each byte length
bincounts_B = np.bincount(lengths_B)
bincounts_G = np.bincount(lengths_G)

# x-axes start at length 1: index 0 of the np.bincount output counts
# tokens of length 0, which is skipped here
xvals_B = np.arange(1,lengths_B.max()+1)
xvals_G = np.arange(1,lengths_G.max()+1)

# scale each curve by its own largest bin
yvals_B = bincounts_B[1:] / bincounts_B.max()
yvals_G = bincounts_G[1:] / bincounts_G.max()

# draw both distributions on log-log axes
fig,ax = plt.subplots(figsize=(10,4))
ax.plot(xvals_B,yvals_B,'s-',color=[.3,.9,.3],markerfacecolor=[.7,.9,.7],label='BERT')
ax.plot(xvals_G,yvals_G,'o-',color=[.3,.3,.9],markerfacecolor=[.7,.7,.9],label='GPT2')

ax.legend(fontsize=14)
ax.set(xlabel='Token length (byte)',ylabel='Density',
       title='Distributions of token lengths',yscale='log',xscale='log')

plt.show()

# **Part 3: Text token length**

In [None]:
# All books share the same url format and differ only by their numerical
# code; the full url is assembled as  baseurl + code + '/pg' + code + '.txt'
baseurl = 'https://www.gutenberg.org/cache/epub/'

# One [code, title] pair per book. The code is kept as a string because it
# is concatenated into the url; the title is a human-readable label only.
bookurls = [
    # code       title
    ['84',    'Frankenstein'    ],
    ['64317', 'GreatGatsby'     ],
    ['11',    'AliceWonderland' ],
    ['1513',  'RomeoJuliet'     ],
    ['76',    'HuckFinn'        ],
    ['219',   'HeartDarkness'   ],
    ['2591',  'GrimmsTales'     ],
    ['2148',  'EdgarAllanPoe'   ],
    ['36',    'WarOfTheWorlds'  ],
    ['829',   'GulliversTravels'],
]

In [None]:
# initialize (one entry per book)
tokens_B = np.zeros(len(bookurls))
tokens_G = np.zeros(len(bookurls))
token_lens_B = np.zeros(len(bookurls))
token_lens_G = np.zeros(len(bookurls))


def _mean_token_bytes(tokenizer,toks):
  """Return the average UTF-8 byte length of the decoded tokens in `toks`.

  Each distinct token id is decoded once and weighted by how often it
  occurs, which gives the same mean as decoding every occurrence but with
  far fewer decode() calls. Returns nan for an empty token list.
  """
  if len(toks)==0:
    return np.nan
  ids,counts = np.unique(toks,return_counts=True)
  nbytes = [ len(tokenizer.decode(int(t)).encode('utf-8')) for t in ids ]
  return np.average(nbytes,weights=counts)


# loop over books
for i,(code,title) in enumerate(bookurls):

  # get the text; stop on an HTTP error instead of tokenizing an error page,
  # and do not wait indefinitely on a stalled connection
  fullurl = baseurl + code + '/pg' + code + '.txt'
  response = requests.get(fullurl,timeout=30)
  response.raise_for_status()
  text = response.text

  # tokenize the text (without BERT's special tokens)
  brt_toks = tokenizer_B.encode(text,add_special_tokens=False)
  gpt_toks = tokenizer_G.encode(text)

  # count the numbers of tokens
  tokens_B[i] = len( brt_toks )
  tokens_G[i] = len( gpt_toks )

  # count the average lengths (in bytes) of the tokens
  token_lens_B[i] = _mean_token_bytes(tokenizer_B,brt_toks)
  token_lens_G[i] = _mean_token_bytes(tokenizer_G,gpt_toks)


In [None]:
# one row, two panels
_,(ax_counts,ax_lens) = plt.subplots(1,2,figsize=(10,4))


### left panel: token counts per book (one marker per book)
ax_counts.plot(tokens_B,tokens_G,'kh',markerfacecolor=[.9,.7,.7,.7],markersize=12)

# line of unity, spanning the smallest to the largest count of either tokenizer
count_vals = np.concatenate((tokens_B,tokens_G))
count_lims = [count_vals.min(),count_vals.max()]
ax_counts.plot(count_lims,count_lims,'--',color=[.5,.5,.5])

# stylize (scientific notation on both axes)
ax_counts.set(xlabel='BERT tokens',ylabel='GPT2 tokens',title='Book lengths in tokens')
ax_counts.ticklabel_format(style='scientific',axis='both',scilimits=(0,0))


### right panel: average token lengths per book
ax_lens.plot(token_lens_B,token_lens_G,'ks',markerfacecolor=[.7,.9,.7,.7],markersize=12)

# line of unity, pushed behind the markers via a low zorder
len_vals = np.concatenate((token_lens_B,token_lens_G))
len_lims = [len_vals.min(),len_vals.max()]
ax_lens.plot(len_lims,len_lims,'--',zorder=-10,color=[.5,.5,.5])

# stylize
ax_lens.set(xlabel='BERT tokens',ylabel='GPT2 tokens',title='Average token lengths (bytes)')


plt.tight_layout()
plt.show()

# **Part 4: Translator functions**

In [None]:
# translation functions
def GPT_to_BERT(toks):
  """Convert GPT2 token ids into BERT token ids by way of the decoded text.

  `toks` is decoded to a string with tokenizer_G, and that string is encoded
  with tokenizer_B without adding BERT's special tokens. Returns the result
  of tokenizer_B.encode().
  """
  return tokenizer_B.encode( tokenizer_G.decode(toks),add_special_tokens=False )

def BERT_to_GPT(t):
  """Convert BERT token ids into GPT2 token ids by way of the decoded text.

  `t` is decoded to a string with tokenizer_B, and that string is encoded
  with tokenizer_G (add_special_tokens=False). Returns the result of
  tokenizer_G.encode().
  """
  as_text = tokenizer_B.decode(t)
  gpt_toks = tokenizer_G.encode(as_text,add_special_tokens=False)
  return gpt_toks

In [None]:
# a test sentence with mixed capitalization
text = 'Canadian winters are kept WArM bY the friendliness of THE PEOPLE.'
print('Original text:\n ',text,'\n')

# tokenize with both tokenizers (no special tokens for BERT)
toks_G = tokenizer_G.encode(text)
toks_B = tokenizer_B.encode(text,add_special_tokens=False)

# show the token ids
print('BERT tokens:\n ',toks_B)
print('GPT2 tokens:\n ',toks_G,'\n')

# decode each sequence with its own tokenizer: is the reconstruction perfect?
recon_B = tokenizer_B.decode(toks_B)
recon_G = tokenizer_G.decode(toks_G)
print('BERT reconstruction:\n ',recon_B)
print('GPT2 reconstruction:\n ',recon_G,'\n')

# translate between tokenizers, then decode with the target tokenizer
gpt_from_bert = BERT_to_GPT(toks_B)
bert_from_gpt = GPT_to_BERT(toks_G)
print('BERT to GPT2 translation:\n ',tokenizer_G.decode(gpt_from_bert))
print('GPT2 to BERT translation:\n ',tokenizer_B.decode(bert_from_gpt))