|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[2] Book lengths in characters, words, and tokens</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import requests
import matplotlib.pyplot as plt

In [None]:
### matplotlib adjustments

# render inline figures as svg (vector graphics, so higher-res plots)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark mode (optional): uncomment these lines for light-on-dark figures
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# hide the right and top borders of every axis
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top']   = False

# bold font for titles and axis labels
plt.rcParams['axes.titleweight']  = 'bold'
plt.rcParams['axes.labelweight']  = 'bold'

# resolution of figures written by plt.savefig()
plt.rcParams['savefig.dpi']       = 300

# **How to import a book from gutenberg.org**

In [None]:
# Project Gutenberg plain-text file for book code 84 (Frankenstein)
url = 'https://www.gutenberg.org/cache/epub/84/pg84.txt'

# Download the book. The timeout prevents the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the cell on an HTTP error
# instead of silently treating the error page as the text of the book.
response = requests.get(url,timeout=30)
response.raise_for_status()
text = response.text

# Show only the beginning: printing the entire book floods the notebook output.
# The complete book remains available in the variable `text`.
print(f'The text contains {len(text):,d} characters. Here are the first 1,000:\n')
print(text[:1000])

# **Part 1: Total characters, words, and tokens**

In [None]:
# load the pretrained GPT-2 tokenizer (Hugging Face transformers library)
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')

In [None]:
# All books are stored with the same url format:
#   baseurl + code + '/pg' + code + '.txt'
# so each book is uniquely identified by its numerical code.
baseurl = 'https://www.gutenberg.org/cache/epub/'

# [code, title] pairs. The code is a string so that it can be concatenated
# directly into the url; the title is a short label used when printing results.
bookurls = [
    # code       title
    ['84',    'Frankenstein'    ],
    ['64317', 'GreatGatsby'     ],
    ['11',    'AliceWonderland' ],
    ['1513',  'RomeoJuliet'     ],
    ['76',    'HuckFinn'        ],
    ['219',   'HeartDarkness'   ],
    ['2591',  'GrimmsTales'     ],
    ['2148',  'EdgarAllanPoe'   ],
    ['36',    'WarOfTheWorlds'  ],
    ['829',   'GulliversTravels']
]

In [None]:
# table header
print('  Book title     |  Chars  |  Words  |  Tokens')
print('-----------------+---------+---------+---------')

# one table row per book
for code,title in bookurls:

  # Download the text. raise_for_status() stops the loop on an HTTP error
  # instead of counting the characters of an error page as if it were the book;
  # the timeout prevents the cell from hanging on a stalled connection.
  fullurl = baseurl + code + '/pg' + code + '.txt'
  response = requests.get(fullurl,timeout=30)
  response.raise_for_status()
  text = response.text

  # length of the book in three different units
  n_chars = len(text)                    # all characters, including whitespace
  n_words = len(text.split())            # whitespace-delimited chunks
  n_token = len(tokenizer.encode(text))  # token ids from the tokenizer

  # print the results (columns line up with the header)
  print(f'{title:16} | {n_chars:>7,d} | {n_words:>7,d} | {n_token:>7,d}')

# **Part 2: Unique and total counts**

In [None]:
# Note: The books are redundantly downloaded.
# In principle it would be more efficient to download only once and then store the texts in Python,
# but the code is fast and there aren't many books, so it's OK here :P

In [None]:
# initialize
totals = np.zeros((3,len(bookurls)),dtype=int)
uniques = np.zeros((3,len(bookurls)),dtype=int)

# loop over books
for i,(code,title) in enumerate(bookurls):

  # get the text
  fullurl = baseurl + code + '/pg' + code + '.txt'
  text = requests.get(fullurl).text

  # count the total and unique characters
  totals[0,i]  = len(text)
  uniques[0,i] = len(set(text)) # only need the count, so it doesn't need to be sorted

  # repeat for words
  totals[1,i]  = len(text.split())
  uniques[1,i] = len(set(text.split()))

  # and for tokens
  tokens = tokenizer.encode(text)
  totals[2,i]  = len(tokens)
  uniques[2,i] = len(set(tokens))

In [None]:
# Seeded random number generator for the horizontal jitter of the markers,
# so that the figure looks the same each time the notebook is run.
rng = np.random.default_rng(0)

fig,axs = plt.subplots(1,2,figsize=(10,3))

# marker colors (RGBA) and shapes for characters, words, and tokens
colors = [ [.9,.7,.7,.7],[.7,.9,.7,.7],[.7,.7,.9,.7] ]
shapes = 'hso'
ticklabels = ['Characters','Words','Tokens']

# the two panels are drawn identically, only the data and the title differ
panels = [ (axs[0],totals, 'A) Total' ),
           (axs[1],uniques,'B) Unique') ]

for ax,counts,panel_title in panels:

  # loop over the three count types (rows of the counts matrix)
  for i in range(3):

    # draw the individual books, jittered around x=i to reduce marker overlap
    ax.plot(rng.normal(i,.06,counts.shape[1]),counts[i,:],'k'+shapes[i],
            markerfacecolor=colors[i],markersize=10)

    # and a horizontal line at the mean across books
    ax.plot([i-.5,i+.5],np.mean(counts[i,:])*np.ones(2),
            color=colors[i],linewidth=3)

  # axis adjustments
  ax.set(xticks=[0,1,2],xticklabels=ticklabels,
         xlim=[-1,3],ylabel='Count',title=panel_title)

  # scientific notation on the y-axis tick labels
  ax.ticklabel_format(style='scientific',axis='y',scilimits=(0,0))

plt.tight_layout()
plt.savefig('ch2_proj2_part2.png')
plt.show()

# **Part 3: Unique by total counts**

In [None]:
fig,axs = plt.subplots(1,3,figsize=(12,3.5))

# Marker shapes match the Part 2 figure (shapes = 'hso'):
# hexagons for characters, squares for words, circles for tokens.
markers = [ 'kh','ks','ko' ]
labels = [ 'Characters','Words','Tokens' ]

# one panel per count type (rows of the totals/uniques matrices)
for i in range(3):

  # pick the color for these markers (gray with one RGB channel emphasized)
  color = [.7,.7,.7]
  color[i] = .9

  # correlation across books between the unique and total counts
  r = np.corrcoef(uniques[i,:],totals[i,:])[0,1]
  title = f'{labels[i]} (r = {r:.2f})'

  # make the plot
  axs[i].plot(uniques[i,:],totals[i,:],markers[i],markerfacecolor=color,markersize=12)
  axs[i].set(xlabel='Unique',ylabel='Total',title=title)

  # Scientific notation on the tick labels of both axes, except that the
  # x-axis of the characters panel (i==0) keeps its plain tick labels.
  sci_axis = 'y' if i==0 else 'both'
  axs[i].ticklabel_format(style='scientific',axis=sci_axis,scilimits=(0,0))


plt.tight_layout()
plt.savefig('ch2_proj2_part3.png')
plt.show()

In [None]:
# which book is the outlier?
# -> the title of the book with the largest number of unique characters
outlier_idx = np.argmax(uniques[0,:])
bookurls[outlier_idx][1]

In [None]:
# number of unique characters in the book with Gutenberg code 2148
# direct link to the text:
# https://www.gutenberg.org/cache/epub/2148/pg2148.txt

# Find the column by book code instead of hard-coding its position,
# so the result stays correct if bookurls is reordered or extended.
outlier_col = [code for code,booktitle in bookurls].index('2148')
uniques[0,outlier_col]

In [None]:
# check all the characters in the book with Gutenberg code 2148

# Download the text. raise_for_status() stops the cell on an HTTP error instead
# of listing the characters of an error page; the timeout prevents the cell
# from hanging on a stalled connection.
response = requests.get('https://www.gutenberg.org/cache/epub/2148/pg2148.txt',timeout=30)
response.raise_for_status()
text = response.text

# every distinct character, sorted for easier visual inspection
sorted(set(text))